Additional refactoring of hash functions (#10462)

Additional work related to #10081. This is a breaking change because it reorganizes several public names/namespaces.

Summary of changes in this PR:
- The `cudf` namespace now wraps the contents of `hash_functions.cuh`, and some public names are now classified as `detail` APIs.
- `SparkMurmurHash3_32` has been updated to align with the design and naming conventions of `MurmurHash3_32`.

Authors:
- Bradley Dice (https://github.com/bdice)

Approvers:
- Jake Hemstad (https://github.com/jrhemstad)
- Vyas Ramasubramani (https://github.com/vyasr)

URL: #10462
rapidsai · Apr 20, 2022 · c8c7271 · c8c7271
1 parent 5f6b70a
commit c8c7271
Show file tree

Hide file tree

Showing 9 changed files with 147 additions and 134 deletions.
diff --git a/cpp/include/cudf/detail/utilities/hash_functions.cuh b/cpp/include/cudf/detail/utilities/hash_functions.cuh
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
@@ -429,17 +429,19 @@ auto create_hash_map(table_device_view const& d_keys,
   size_type constexpr unused_key{std::numeric_limits<size_type>::max()};
   size_type constexpr unused_value{std::numeric_limits<size_type>::max()};
 
-  using map_type = concurrent_unordered_map<size_type,
-                                            size_type,
-                                            row_hasher<default_hash, nullate::DYNAMIC>,
-                                            row_equality_comparator<nullate::DYNAMIC>>;
+  using map_type =
+    concurrent_unordered_map<size_type,
+                             size_type,
+                             row_hasher<cudf::detail::default_hash, nullate::DYNAMIC>,
+                             row_equality_comparator<nullate::DYNAMIC>>;
 
   using allocator_type = typename map_type::allocator_type;
 
   auto const null_keys_are_equal =
     include_null_keys == null_policy::INCLUDE ? null_equality::EQUAL : null_equality::UNEQUAL;
 
-  row_hasher<default_hash, nullate::DYNAMIC> hasher{nullate::DYNAMIC{keys_have_nulls}, d_keys};
+  row_hasher<cudf::detail::default_hash, nullate::DYNAMIC> hasher{nullate::DYNAMIC{keys_have_nulls},
+                                                                  d_keys};
   row_equality_comparator rows_equal{
     nullate::DYNAMIC{keys_have_nulls}, d_keys, d_keys, null_keys_are_equal};
 

diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh
@@ -113,7 +113,7 @@ union pair_packer<pair_type, std::enable_if_t<is_packable<pair_type>()>> {
  */
 template <typename Key,
           typename Element,
-          typename Hasher    = default_hash<Key>,
+          typename Hasher    = cudf::detail::default_hash<Key>,
           typename Equality  = equal_to<Key>,
           typename Allocator = default_allocator<thrust::pair<Key, Element>>>
 class concurrent_unordered_map {

diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu
@@ -356,7 +356,7 @@ __device__ field_descriptor next_field_descriptor(const char* begin,
       ? field_descriptor{field_idx, begin, cudf::io::gpu::seek_field_end(begin, end, opts, true)}
       : [&]() {
           auto const key_range = get_next_key(begin, end, opts.quotechar);
-          auto const key_hash  = MurmurHash3_32<cudf::string_view>{}(
+          auto const key_hash  = cudf::detail::MurmurHash3_32<cudf::string_view>{}(
             cudf::string_view(key_range.first, key_range.second - key_range.first));
           auto const hash_col = col_map.find(key_hash);
           // Fall back to field index if not found (parsing error)
@@ -667,7 +667,8 @@ __global__ void collect_keys_info_kernel(parse_options_view const options,
       keys_info->column(0).element<uint64_t>(idx) = field_range.key_begin - data.begin();
       keys_info->column(1).element<uint16_t>(idx) = len;
       keys_info->column(2).element<uint32_t>(idx) =
-        MurmurHash3_32<cudf::string_view>{}(cudf::string_view(field_range.key_begin, len));
+        cudf::detail::MurmurHash3_32<cudf::string_view>{}(
+          cudf::string_view(field_range.key_begin, len));
     }
   }
 }

diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu
@@ -60,7 +60,10 @@ struct equality_functor {
 template <typename T>
 struct hash_functor {
   column_device_view const& col;
-  __device__ auto operator()(size_type idx) { return MurmurHash3_32<T>{}(col.element<T>(idx)); }
+  __device__ auto operator()(size_type idx) const
+  {
+    return cudf::detail::MurmurHash3_32<T>{}(col.element<T>(idx));
+  }
 };
 
 struct map_insert_fn {

diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu
@@ -779,10 +779,10 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition(
         if (!is_numeric(input.column(column_id).type()))
           CUDF_FAIL("IdentityHash does not support this data type");
       }
-      return detail::local::hash_partition<IdentityHash>(
+      return detail::local::hash_partition<detail::IdentityHash>(
         input, columns_to_hash, num_partitions, seed, stream, mr);
     case (hash_id::HASH_MURMUR3):
-      return detail::local::hash_partition<MurmurHash3_32>(
+      return detail::local::hash_partition<detail::MurmurHash3_32>(
         input, columns_to_hash, num_partitions, seed, stream, mr);
     default: CUDF_FAIL("Unsupported hash function in hash_partition");
   }

diff --git a/cpp/src/text/subword/bpe_tokenizer.cu b/cpp/src/text/subword/bpe_tokenizer.cu
@@ -22,6 +22,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/get_value.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/hash_functions.cuh>
 #include <cudf/strings/detail/combine.hpp>
 #include <cudf/strings/detail/utilities.cuh>
 #include <cudf/strings/detail/utilities.hpp>
@@ -144,8 +145,8 @@ struct byte_pair_encoding_fn {
    * @param rhs Second string.
    * @return The hash value to match with `d_map`.
    */
-  __device__ hash_value_type compute_hash(cudf::string_view const& lhs,
-                                          cudf::string_view const& rhs)
+  __device__ cudf::hash_value_type compute_hash(cudf::string_view const& lhs,
+                                                cudf::string_view const& rhs)
   {
     __shared__ char shmem[48 * 1024];  // max for Pascal
     auto const total_size         = lhs.size_bytes() + rhs.size_bytes() + 1;

diff --git a/cpp/src/text/subword/bpe_tokenizer.cuh b/cpp/src/text/subword/bpe_tokenizer.cuh
@@ -36,12 +36,12 @@ namespace detail {
 
 using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allocator<char>>;
 
-using merge_pairs_map_type = cuco::static_map<hash_value_type,
+using merge_pairs_map_type = cuco::static_map<cudf::hash_value_type,
                                               cudf::size_type,
                                               cuda::thread_scope_device,
                                               hash_table_allocator_type>;
 
-using string_hasher_type = MurmurHash3_32<cudf::string_view>;
+using string_hasher_type = cudf::detail::MurmurHash3_32<cudf::string_view>;
 
 }  // namespace detail
 

diff --git a/cpp/src/text/subword/load_merges_file.cu b/cpp/src/text/subword/load_merges_file.cu
@@ -21,6 +21,7 @@
 #include <cudf/column/column_factories.hpp>
 #include <cudf/detail/iterator.cuh>
 #include <cudf/detail/nvtx/ranges.hpp>
+#include <cudf/detail/utilities/hash_functions.cuh>
 #include <cudf/detail/utilities/vector_factories.hpp>
 #include <cudf/utilities/error.hpp>
 
@@ -42,7 +43,7 @@ struct make_pair_function {
   /**
    * @brief Hash the merge pair entry
    */
-  __device__ cuco::pair_type<hash_value_type, cudf::size_type> operator()(cudf::size_type idx)
+  __device__ cuco::pair_type<cudf::hash_value_type, cudf::size_type> operator()(cudf::size_type idx)
   {
     auto const result = _hasher(d_strings.element<cudf::string_view>(idx));
     return cuco::make_pair(result, idx);
@@ -105,9 +106,9 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
   // Ensure capacity is at least (size/0.7) as documented here:
   // https://github.com/NVIDIA/cuCollections/blob/6ec8b6dcdeceea07ab4456d32461a05c18864411/include/cuco/static_map.cuh#L179-L182
   auto merge_pairs_map = std::make_unique<merge_pairs_map_type>(
-    static_cast<size_t>(input.size() * 2),        // capacity is 2x;
-    std::numeric_limits<hash_value_type>::max(),  // empty key;
-    -1,                                           // empty value is not used
+    static_cast<size_t>(input.size() * 2),              // capacity is 2x;
+    std::numeric_limits<cudf::hash_value_type>::max(),  // empty key;
+    -1,                                                 // empty value is not used
     hash_table_allocator_type{default_allocator<char>{}, stream},
     stream.value());
 
@@ -117,8 +118,8 @@ std::unique_ptr<detail::merge_pairs_map_type> initialize_merge_pairs_map(
 
   merge_pairs_map->insert(iter,
                           iter + input.size(),
-                          cuco::detail::MurmurHash3_32<hash_value_type>{},
-                          thrust::equal_to<hash_value_type>{},
+                          cuco::detail::MurmurHash3_32<cudf::hash_value_type>{},
+                          thrust::equal_to<cudf::hash_value_type>{},
                           stream.value());
 
   return merge_pairs_map;