From 3ba63c3c3cb72950adc4c9699fcfa1a72796a041 Mon Sep 17 00:00:00 2001
From: Yunsong Wang <yunsongw@nvidia.com>
Date: Thu, 15 Feb 2024 10:50:54 -0800
Subject: [PATCH] Update cudf for compatibility with the latest cuco (#14849)

Depends on https://github.com/rapidsai/rapids-cmake/pull/526

CMakes changes will be reverted once https://github.com/rapidsai/rapids-cmake/pull/526 is merged.

This PR updates libcudf to make it compatible with the latest cuco.

Authors:
  - Yunsong Wang (https://github.com/PointKernel)

Approvers:
  - David Wendt (https://github.com/davidwendt)
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/14849
---
 .../cudf/detail/hash_reduce_by_row.cuh        |  4 +-
 cpp/include/cudf/detail/join.hpp              |  2 +-
 cpp/src/io/json/json_tree.cu                  | 32 ++++++++-------
 cpp/src/io/orc/orc_gpu.hpp                    |  4 +-
 cpp/src/io/parquet/parquet_gpu.cuh            |  4 +-
 cpp/src/join/join_common_utils.hpp            | 13 ++++---
 cpp/src/search/contains_table.cu              | 19 ++++-----
 cpp/src/stream_compaction/distinct_count.cu   | 15 +++----
 .../stream_compaction_common.hpp              |  4 +-
 cpp/src/text/bpe/byte_pair_encoding.cuh       | 39 ++++++++++---------
 cpp/src/text/bpe/load_merge_pairs.cu          |  4 ++
 cpp/src/text/vocabulary_tokenize.cu           | 22 ++++++-----
 12 files changed, 89 insertions(+), 73 deletions(-)

diff --git a/cpp/include/cudf/detail/hash_reduce_by_row.cuh b/cpp/include/cudf/detail/hash_reduce_by_row.cuh
index 006cb5142c9..a740b5c4e93 100644
--- a/cpp/include/cudf/detail/hash_reduce_by_row.cuh
+++ b/cpp/include/cudf/detail/hash_reduce_by_row.cuh
@@ -31,8 +31,8 @@
 
 namespace cudf::detail {
 
-using hash_map_type =
-  cuco::static_map<size_type, size_type, cuda::thread_scope_device, cudf::detail::cuco_allocator>;
+using hash_map_type = cuco::legacy::
+  static_map<size_type, size_type, cuda::thread_scope_device, cudf::detail::cuco_allocator>;
 
 /**
  * @brief The base struct for customized reduction functor to perform reduce-by-key with keys are
diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp
index ad6269dae30..27d14874bce 100644
--- a/cpp/include/cudf/detail/join.hpp
+++ b/cpp/include/cudf/detail/join.hpp
@@ -62,7 +62,7 @@ struct hash_join {
                           cudf::size_type,
                           cuda::thread_scope_device,
                           cudf::detail::cuco_allocator,
-                          cuco::double_hashing<DEFAULT_JOIN_CG_SIZE, Hasher, Hasher>>;
+                          cuco::legacy::double_hashing<DEFAULT_JOIN_CG_SIZE, Hasher, Hasher>>;
 
   hash_join()                            = delete;
   ~hash_join()                           = default;
diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu
index db9daf28c06..148aeb5ec7a 100644
--- a/cpp/src/io/json/json_tree.cu
+++ b/cpp/src/io/json/json_tree.cu
@@ -548,13 +548,14 @@ rmm::device_uvector<size_type> hash_node_type_with_field_name(device_span<Symbol
   using hasher_type                             = decltype(d_hasher);
   constexpr size_type empty_node_index_sentinel = -1;
   auto key_set =
-    cuco::experimental::static_set{cuco::experimental::extent{compute_hash_table_size(
-                                     num_fields, 40)},  // 40% occupancy in hash map
-                                   cuco::empty_key{empty_node_index_sentinel},
-                                   d_equal,
-                                   cuco::experimental::linear_probing<1, hasher_type>{d_hasher},
-                                   cudf::detail::cuco_allocator{stream},
-                                   stream.value()};
+    cuco::static_set{cuco::extent{compute_hash_table_size(num_fields, 40)},  // 40% occupancy
+                     cuco::empty_key{empty_node_index_sentinel},
+                     d_equal,
+                     cuco::linear_probing<1, hasher_type>{d_hasher},
+                     {},
+                     {},
+                     cudf::detail::cuco_allocator{stream},
+                     stream.value()};
   key_set.insert_if_async(iter,
                           iter + num_nodes,
                           thrust::counting_iterator<size_type>(0),  // stencil
@@ -562,7 +563,7 @@ rmm::device_uvector<size_type> hash_node_type_with_field_name(device_span<Symbol
                           stream.value());
 
   auto const get_hash_value =
-    [key_set = key_set.ref(cuco::experimental::op::find)] __device__(auto node_id) -> size_type {
+    [key_set = key_set.ref(cuco::op::find)] __device__(auto node_id) -> size_type {
     auto const it = key_set.find(node_id);
     return (it == key_set.end()) ? size_type{0} : *it;
   };
@@ -735,13 +736,14 @@ std::pair<rmm::device_uvector<size_type>, rmm::device_uvector<size_type>> hash_n
   constexpr size_type empty_node_index_sentinel = -1;
   using hasher_type                             = decltype(d_hashed_cache);
 
-  auto key_set = cuco::experimental::static_set{
-    cuco::experimental::extent{compute_hash_table_size(num_nodes)},
-    cuco::empty_key<cudf::size_type>{empty_node_index_sentinel},
-    d_equal,
-    cuco::experimental::linear_probing<1, hasher_type>{d_hashed_cache},
-    cudf::detail::cuco_allocator{stream},
-    stream.value()};
+  auto key_set = cuco::static_set{cuco::extent{compute_hash_table_size(num_nodes)},
+                                  cuco::empty_key<cudf::size_type>{empty_node_index_sentinel},
+                                  d_equal,
+                                  cuco::linear_probing<1, hasher_type>{d_hashed_cache},
+                                  {},
+                                  {},
+                                  cudf::detail::cuco_allocator{stream},
+                                  stream.value()};
 
   // insert and convert node ids to unique set ids
   auto nodes_itr         = thrust::make_counting_iterator<size_type>(0);
diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp
index 243704b65d4..c2570d71c24 100644
--- a/cpp/src/io/orc/orc_gpu.hpp
+++ b/cpp/src/io/orc/orc_gpu.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -44,7 +44,7 @@ using cudf::detail::host_2dspan;
 auto constexpr KEY_SENTINEL   = size_type{-1};
 auto constexpr VALUE_SENTINEL = size_type{-1};
 
-using map_type = cuco::static_map<size_type, size_type>;
+using map_type = cuco::legacy::static_map<size_type, size_type>;
 
 /**
  * @brief The alias of `map_type::pair_atomic_type` class.
diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh
index 10e12ebb782..e3c44c78898 100644
--- a/cpp/src/io/parquet/parquet_gpu.cuh
+++ b/cpp/src/io/parquet/parquet_gpu.cuh
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -28,7 +28,7 @@ namespace cudf::io::parquet::detail {
 auto constexpr KEY_SENTINEL   = size_type{-1};
 auto constexpr VALUE_SENTINEL = size_type{-1};
 
-using map_type = cuco::static_map<size_type, size_type>;
+using map_type = cuco::legacy::static_map<size_type, size_type>;
 
 /**
  * @brief The alias of `map_type::pair_atomic_type` class.
diff --git a/cpp/src/join/join_common_utils.hpp b/cpp/src/join/join_common_utils.hpp
index b88a4fdef58..4d361b23502 100644
--- a/cpp/src/join/join_common_utils.hpp
+++ b/cpp/src/join/join_common_utils.hpp
@@ -45,13 +45,14 @@ using multimap_type = cudf::hash_join::impl_type::map_type;
 // Multimap type used for mixed joins. TODO: This is a temporary alias used
 // until the mixed joins are converted to using CGs properly. Right now it's
 // using a cooperative group of size 1.
-using mixed_multimap_type = cuco::static_multimap<hash_value_type,
-                                                  size_type,
-                                                  cuda::thread_scope_device,
-                                                  cudf::detail::cuco_allocator,
-                                                  cuco::double_hashing<1, hash_type, hash_type>>;
+using mixed_multimap_type =
+  cuco::static_multimap<hash_value_type,
+                        size_type,
+                        cuda::thread_scope_device,
+                        cudf::detail::cuco_allocator,
+                        cuco::legacy::double_hashing<1, hash_type, hash_type>>;
 
-using semi_map_type = cuco::
+using semi_map_type = cuco::legacy::
   static_map<hash_value_type, size_type, cuda::thread_scope_device, cudf::detail::cuco_allocator>;
 
 using row_hash_legacy =
diff --git a/cpp/src/search/contains_table.cu b/cpp/src/search/contains_table.cu
index ce069abcb78..e1d0fab6025 100644
--- a/cpp/src/search/contains_table.cu
+++ b/cpp/src/search/contains_table.cu
@@ -158,9 +158,9 @@ void dispatch_nan_comparator(
   // Distinguish probing scheme CG sizes between nested and flat types for better performance
   auto const probing_scheme = [&]() {
     if constexpr (HasNested) {
-      return cuco::experimental::linear_probing<4, Hasher>{d_hasher};
+      return cuco::linear_probing<4, Hasher>{d_hasher};
     } else {
-      return cuco::experimental::linear_probing<1, Hasher>{d_hasher};
+      return cuco::linear_probing<1, Hasher>{d_hasher};
     }
   }();
 
@@ -228,13 +228,14 @@ rmm::device_uvector<bool> contains(table_view const& haystack,
     [&](auto const& d_self_equal, auto const& d_two_table_equal, auto const& probing_scheme) {
       auto const d_equal = comparator_adapter{d_self_equal, d_two_table_equal};
 
-      auto set = cuco::experimental::static_set{
-        cuco::experimental::extent{compute_hash_table_size(haystack.num_rows())},
-        cuco::empty_key{lhs_index_type{-1}},
-        d_equal,
-        probing_scheme,
-        cudf::detail::cuco_allocator{stream},
-        stream.value()};
+      auto set = cuco::static_set{cuco::extent{compute_hash_table_size(haystack.num_rows())},
+                                  cuco::empty_key{lhs_index_type{-1}},
+                                  d_equal,
+                                  probing_scheme,
+                                  {},
+                                  {},
+                                  cudf::detail::cuco_allocator{stream},
+                                  stream.value()};
 
       if (haystack_has_nulls && compare_nulls == null_equality::UNEQUAL) {
         auto const bitmask_buffer_and_ptr = build_row_bitmask(haystack, stream);
diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu
index 507bad777eb..3ec1be42bfe 100644
--- a/cpp/src/stream_compaction/distinct_count.cu
+++ b/cpp/src/stream_compaction/distinct_count.cu
@@ -141,13 +141,14 @@ cudf::size_type distinct_count(table_view const& keys,
 
   auto const comparator_helper = [&](auto const row_equal) {
     using hasher_type = decltype(hash_key);
-    auto key_set =
-      cuco::experimental::static_set{cuco::experimental::extent{compute_hash_table_size(num_rows)},
-                                     cuco::empty_key<cudf::size_type>{-1},
-                                     row_equal,
-                                     cuco::experimental::linear_probing<1, hasher_type>{hash_key},
-                                     cudf::detail::cuco_allocator{stream},
-                                     stream.value()};
+    auto key_set      = cuco::static_set{cuco::extent{compute_hash_table_size(num_rows)},
+                                    cuco::empty_key<cudf::size_type>{-1},
+                                    row_equal,
+                                    cuco::linear_probing<1, hasher_type>{hash_key},
+                                         {},
+                                         {},
+                                    cudf::detail::cuco_allocator{stream},
+                                    stream.value()};
 
     auto const iter = thrust::counting_iterator<cudf::size_type>(0);
     // when nulls are equal, we skip hashing any row that has a null
diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp
index ceb62d1d059..dd7d76168d9 100644
--- a/cpp/src/stream_compaction/stream_compaction_common.hpp
+++ b/cpp/src/stream_compaction/stream_compaction_common.hpp
@@ -27,8 +27,8 @@
 namespace cudf {
 namespace detail {
 
-using hash_map_type =
-  cuco::static_map<size_type, size_type, cuda::thread_scope_device, cudf::detail::cuco_allocator>;
+using hash_map_type = cuco::legacy::
+  static_map<size_type, size_type, cuda::thread_scope_device, cudf::detail::cuco_allocator>;
 
 }  // namespace detail
 }  // namespace cudf
diff --git a/cpp/src/text/bpe/byte_pair_encoding.cuh b/cpp/src/text/bpe/byte_pair_encoding.cuh
index 1a3f8eadea0..02a8a6c4d0a 100644
--- a/cpp/src/text/bpe/byte_pair_encoding.cuh
+++ b/cpp/src/text/bpe/byte_pair_encoding.cuh
@@ -44,6 +44,7 @@ namespace detail {
 using string_hasher_type = cudf::hashing::detail::MurmurHash3_x86_32<cudf::string_view>;
 using hash_value_type    = string_hasher_type::result_type;
 using merge_pair_type    = thrust::pair<cudf::string_view, cudf::string_view>;
+using cuco_storage       = cuco::storage<1>;
 
 /**
  * @brief Hasher function used for building and using the cuco static-map
@@ -98,15 +99,16 @@ struct bpe_equal {
   }
 };
 
-using bpe_probe_scheme = cuco::experimental::linear_probing<1, bpe_hasher>;
+using bpe_probe_scheme = cuco::linear_probing<1, bpe_hasher>;
 
-using merge_pairs_map_type = cuco::experimental::static_map<cudf::size_type,
-                                                            cudf::size_type,
-                                                            cuco::experimental::extent<std::size_t>,
-                                                            cuda::thread_scope_device,
-                                                            bpe_equal,
-                                                            bpe_probe_scheme,
-                                                            cudf::detail::cuco_allocator>;
+using merge_pairs_map_type = cuco::static_map<cudf::size_type,
+                                              cudf::size_type,
+                                              cuco::extent<std::size_t>,
+                                              cuda::thread_scope_device,
+                                              bpe_equal,
+                                              bpe_probe_scheme,
+                                              cudf::detail::cuco_allocator,
+                                              cuco_storage>;
 
 /**
  * @brief Hasher function used for building and using the cuco static-map
@@ -155,15 +157,16 @@ struct mp_equal {
   }
 };
 
-using mp_probe_scheme = cuco::experimental::linear_probing<1, mp_hasher>;
+using mp_probe_scheme = cuco::linear_probing<1, mp_hasher>;
 
-using mp_table_map_type = cuco::experimental::static_map<cudf::size_type,
-                                                         cudf::size_type,
-                                                         cuco::experimental::extent<std::size_t>,
-                                                         cuda::thread_scope_device,
-                                                         mp_equal,
-                                                         mp_probe_scheme,
-                                                         cudf::detail::cuco_allocator>;
+using mp_table_map_type = cuco::static_map<cudf::size_type,
+                                           cudf::size_type,
+                                           cuco::extent<std::size_t>,
+                                           cuda::thread_scope_device,
+                                           mp_equal,
+                                           mp_probe_scheme,
+                                           cudf::detail::cuco_allocator,
+                                           cuco_storage>;
 
 }  // namespace detail
 
@@ -185,8 +188,8 @@ struct bpe_merge_pairs::bpe_merge_pairs_impl {
                        std::unique_ptr<detail::mp_table_map_type>&& mp_table_map);
 
   auto const get_merge_pairs() const { return *d_merge_pairs; }
-  auto get_merge_pairs_ref() const { return merge_pairs_map->ref(cuco::experimental::op::find); }
-  auto get_mp_table_ref() const { return mp_table_map->ref(cuco::experimental::op::find); }
+  auto get_merge_pairs_ref() const { return merge_pairs_map->ref(cuco::op::find); }
+  auto get_mp_table_ref() const { return mp_table_map->ref(cuco::op::find); }
 };
 
 }  // namespace nvtext
diff --git a/cpp/src/text/bpe/load_merge_pairs.cu b/cpp/src/text/bpe/load_merge_pairs.cu
index 3b630886b3e..8da2d745966 100644
--- a/cpp/src/text/bpe/load_merge_pairs.cu
+++ b/cpp/src/text/bpe/load_merge_pairs.cu
@@ -48,6 +48,8 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
                                            cuco::empty_value{-1},
                                            bpe_equal{input},
                                            bpe_probe_scheme{bpe_hasher{input}},
+                                           cuco::thread_scope_device,
+                                           cuco_storage{},
                                            cudf::detail::cuco_allocator{stream},
                                            stream.value());
 
@@ -69,6 +71,8 @@ std::unique_ptr<detail::mp_table_map_type> initialize_mp_table_map(
                                                           cuco::empty_value{-1},
                                                           mp_equal{input},
                                                           mp_probe_scheme{mp_hasher{input}},
+                                                          cuco::thread_scope_device,
+                                                          cuco_storage{},
                                                           cudf::detail::cuco_allocator{stream},
                                                           stream.value());
 
diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index c6e90c6fcaa..b6991e534bf 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -93,14 +93,16 @@ struct vocab_equal {
   }
 };
 
-using probe_scheme        = cuco::experimental::linear_probing<1, vocab_hasher>;
-using vocabulary_map_type = cuco::experimental::static_map<cudf::size_type,
-                                                           cudf::size_type,
-                                                           cuco::experimental::extent<std::size_t>,
-                                                           cuda::thread_scope_device,
-                                                           vocab_equal,
-                                                           probe_scheme,
-                                                           cudf::detail::cuco_allocator>;
+using probe_scheme        = cuco::linear_probing<1, vocab_hasher>;
+using cuco_storage        = cuco::storage<1>;
+using vocabulary_map_type = cuco::static_map<cudf::size_type,
+                                             cudf::size_type,
+                                             cuco::extent<std::size_t>,
+                                             cuda::thread_scope_device,
+                                             vocab_equal,
+                                             probe_scheme,
+                                             cudf::detail::cuco_allocator,
+                                             cuco_storage>;
 }  // namespace
 }  // namespace detail
 
@@ -115,7 +117,7 @@ struct tokenize_vocabulary::tokenize_vocabulary_impl {
   col_device_view const d_vocabulary;
   std::unique_ptr<detail::vocabulary_map_type> vocabulary_map;
 
-  auto get_map_ref() const { return vocabulary_map->ref(cuco::experimental::op::find); }
+  auto get_map_ref() const { return vocabulary_map->ref(cuco::op::find); }
 
   tokenize_vocabulary_impl(std::unique_ptr<cudf::column>&& vocab,
                            col_device_view&& d_vocab,
@@ -149,6 +151,8 @@ tokenize_vocabulary::tokenize_vocabulary(cudf::strings_column_view const& input,
     cuco::empty_value{-1},
     detail::vocab_equal{*d_vocabulary},
     detail::probe_scheme{detail::vocab_hasher{*d_vocabulary}},
+    cuco::thread_scope_device,
+    detail::cuco_storage{},
     cudf::detail::cuco_allocator{stream},
     stream.value());