rapidsai · rapids-bot · Oct 12, 2021 · Oct 4, 2021 · Oct 4, 2021 · Oct 6, 2021
@@ -54,17 +54,23 @@ def hash_partition(source_table, object columns_to_hash,
     )
 
 
-def hash(source_table, object initial_hash_values=None, int seed=0):
-    cdef vector[uint32_t] c_initial_hash = initial_hash_values or []
+def hash(source_table, str method, object initial_hash=None, int seed=0):
+    cdef vector[uint32_t] c_initial_hash = initial_hash or []
     cdef table_view c_source_view = table_view_from_table(
         source_table, ignore_index=True)
-
     cdef unique_ptr[column] c_result
+    cdef libcudf_types.hash_id c_hash_function
+    if method == "murmur3":
+        c_hash_function = libcudf_types.hash_id.HASH_MURMUR3
+    elif method == "md5":
+        c_hash_function = libcudf_types.hash_id.HASH_MD5
+    else:
+        raise ValueError(f"Unsupported hash function: {method}")
     with nogil:
         c_result = move(
             cpp_hash(
                 c_source_view,
-                libcudf_types.hash_id.HASH_MURMUR3,
+                c_hash_function,
                 c_initial_hash,
                 seed
             )

@@ -5003,22 +5003,33 @@ def apply_chunks(
             tpb=tpb,
         )
 
-    def hash_columns(self, columns=None):
+    def hash_columns(self, columns=None, method="murmur3"):
-        """Hash the given *columns* and return a new device array
+        """Hash the given *columns* and return a new Series
 
         Parameters
         ----------
         columns : sequence of str; optional
             Sequence of column names. If columns is *None* (unspecified),
             all columns in the frame are used.
+        method : {'murmur3', 'md5'}, default 'murmur3'
+            Hash function to use:
+            * murmur3: MurmurHash3 hash function.
+            * md5: MD5 hash function.
+
+        Returns
+        -------
+        Series
+            Hash values for each row.
         """
-        if columns is None:
-            table_to_hash = self
-        else:
-            cols = [self[k]._column for k in columns]
-            table_to_hash = Frame(data=dict(zip(columns, cols)))
+        table_to_hash = (
+            self
+            if columns is None
+            else Frame(data={k: self._data[k] for k in columns})
+        )
 
-        return Series(table_to_hash._hash()).values
+        return Series._from_data(
+            {None: table_to_hash._hash(method=method)}, index=self.index
+        )
 
     def partition_by_hash(self, columns, nparts, keep_index=True):
         """Partition the dataframe by the hashed value of data in *columns*.

@@ -623,8 +623,8 @@ def _gather(self, gather_map, keep_index=True, nullify=False):
             result._index.names = self._index.names
         return result
 
-    def _hash(self, initial_hash_values=None):
-        return libcudf.hash.hash(self, initial_hash_values)
+    def _hash(self, method, initial_hash=None):
+        return libcudf.hash.hash(self, method, initial_hash)
 
     def _hash_partition(
         self, columns_to_hash, num_partitions, keep_index=True

@@ -4095,13 +4095,20 @@ def floor(self):
         """
         return self._unaryop("floor")
 
-    def hash_values(self):
+    def hash_values(self, method="murmur3"):
         """Compute the hash of values in this column.
 
+        Parameters
+        ----------
+        method : {'murmur3', 'md5'}, default 'murmur3'
+            Hash function to use:
+            * murmur3: MurmurHash3 hash function.
+            * md5: MD5 hash function.
+
         Returns
         -------
-        cupy array
-            A cupy array with hash values.
+        Series
+            A Series with hash values.
 
         Examples
         --------
@@ -4112,10 +4119,15 @@ def hash_values(self):
         1    120
         2     30
         dtype: int64
-        >>> series.hash_values()
-        array([-1930516747,   422619251,  -941520876], dtype=int32)
+        >>> series.hash_values(method="murmur3")
+        0   -1930516747
+        1     422619251
+        2    -941520876
+        dtype: int32
         """
-        return Series(self._hash()).values
+        return Series._from_data(
+            {None: self._hash(method=method)}, index=self.index
+        )
 
     def hash_encode(self, stop, use_name=False):
         """Encode column values as ints in [0, stop) using hash function.
@@ -4158,13 +4170,19 @@ def hash_encode(self, stop, use_name=False):
             raise ValueError("stop must be a positive integer.")
 
         initial_hash = [hash(self.name) & 0xFFFFFFFF] if use_name else None
-        hashed_values = Series(self._hash(initial_hash))
+        hashed_values = Series._from_data(
+            {
+                self.name: self._hash(
+                    method="murmur3", initial_hash=initial_hash
+                )
+            },
+            self.index,
+        )
 
         if hashed_values.has_nulls:
             raise ValueError("Column must have no nulls.")
 
-        mod_vals = hashed_values % stop
-        return Series(mod_vals._column, index=self.index, name=self.name)
+        return hashed_values % stop
 
     def quantile(
         self, q=0.5, interpolation="linear", exact=True, quant_index=True

@@ -1103,27 +1103,28 @@ def test_assign():
 
 
 @pytest.mark.parametrize("nrows", [1, 8, 100, 1000])
-def test_dataframe_hash_columns(nrows):
+@pytest.mark.parametrize("method", ["murmur3", "md5"])
+def test_dataframe_hash_columns(nrows, method):
     gdf = cudf.DataFrame()
     data = np.asarray(range(nrows))
     data[0] = data[-1]  # make first and last the same
     gdf["a"] = data
     gdf["b"] = gdf.a + 100
     out = gdf.hash_columns(["a", "b"])
-    assert isinstance(out, cupy.ndarray)
+    assert isinstance(out, cudf.Series)
     assert len(out) == nrows
     assert out.dtype == np.int32
 
     # Check default
     out_all = gdf.hash_columns()
-    np.testing.assert_array_equal(cupy.asnumpy(out), cupy.asnumpy(out_all))
+    assert_eq(out, out_all)
 
     # Check single column
-    out_one = cupy.asnumpy(gdf.hash_columns(["a"]))
+    out_one = gdf.hash_columns(["a"], method=method)
     # First matches last
-    assert out_one[0] == out_one[-1]
+    assert out_one.iloc[0] == out_one.iloc[-1]
     # Equivalent to the cudf.Series.hash_values()
-    np.testing.assert_array_equal(cupy.asnumpy(gdf.a.hash_values()), out_one)
+    assert_eq(gdf["a"].hash_values(method=method), out_one)
 
 
 @pytest.mark.parametrize("nrows", [3, 10, 100, 1000])

@@ -1272,3 +1272,47 @@ def test_series_sort_index(
         assert_eq(ps, gs, check_index_type=True)
     else:
         assert_eq(expected, got, check_index_type=True)
+
+
+@pytest.mark.parametrize(
+    "method,validation_data",
+    [
+        (
+            "md5",
+            [
+                "d41d8cd98f00b204e9800998ecf8427e",
+                "cfcd208495d565ef66e7dff9f98764da",
+                "3d3aaae21d57b101227f0384f644abe0",
+                "3e76c7023d771ad1c1520c27ab3d4874",
+                "f8d805e33ec3ade1a6ea251ac1c118e7",
+                "c30515f66a5aec7af7666abf33600c92",
+                "c61a4185135eda043f35e92c3505e180",
+                "52da74c75cb6575d25be29e66bd0adde",
+                "5152ac13bdd09110d9ee9c169a3d9237",
+                "f1d3ff8443297732862df21dc4e57262",
+            ],
+        )
+    ],
+)
+def test_series_hash_values(method, validation_data):
+    inputs = cudf.Series(
+        [
+            "",
+            "0",
+            "A 56 character string to test message padding algorithm.",
+            "A 63 character string to test message padding algorithm, again.",
+            "A 64 character string to test message padding algorithm, again!!",
+            (
+                "A very long (greater than 128 bytes/char string) to execute "
+                "a multi hash-step data point in the hash function being "
+                "tested. This string needed to be longer."
+            ),
+            "All work and no play makes Jack a dull boy",
+            "!\"#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~",
+            "\x00\x00\x00\x10\x00\x00\x00\x00",
+            "\x00\x00\x00\x00",
+        ]
+    )
+    validation_results = cudf.Series(validation_data)
+    hash_values = inputs.hash_values(method=method)
+    assert_eq(hash_values, validation_results)