Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

MD5 Python hash API #9390

Merged
merged 12 commits into from
Oct 12, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 10 additions & 4 deletions python/cudf/cudf/_lib/hash.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -54,17 +54,23 @@ def hash_partition(source_table, object columns_to_hash,
)


def hash(source_table, object initial_hash_values=None, int seed=0):
cdef vector[uint32_t] c_initial_hash = initial_hash_values or []
def hash(source_table, str method, object initial_hash=None, int seed=0):
cdef vector[uint32_t] c_initial_hash = initial_hash or []
cdef table_view c_source_view = table_view_from_table(
source_table, ignore_index=True)

cdef unique_ptr[column] c_result
cdef libcudf_types.hash_id c_hash_function
if method == "murmur3":
c_hash_function = libcudf_types.hash_id.HASH_MURMUR3
elif method == "md5":
c_hash_function = libcudf_types.hash_id.HASH_MD5
else:
raise ValueError(f"Unsupported hash function: {method}")
with nogil:
c_result = move(
cpp_hash(
c_source_view,
libcudf_types.hash_id.HASH_MURMUR3,
c_hash_function,
c_initial_hash,
seed
)
Expand Down
25 changes: 18 additions & 7 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5003,22 +5003,33 @@ def apply_chunks(
tpb=tpb,
)

def hash_columns(self, columns=None):
def hash_columns(self, columns=None, method="murmur3"):
"""Hash the given *columns* and return a Series of per-row hash values

Parameters
----------
columns : sequence of str; optional
Sequence of column names. If columns is *None* (unspecified),
all columns in the frame are used.
method : {'murmur3', 'md5'}, default 'murmur3'
Hash function to use:
* murmur3: MurmurHash3 hash function.
* md5: MD5 hash function.

Returns
-------
Series
Hash values for each row.
"""
if columns is None:
table_to_hash = self
else:
cols = [self[k]._column for k in columns]
table_to_hash = Frame(data=dict(zip(columns, cols)))
table_to_hash = (
self
if columns is None
else Frame(data={k: self._data[k] for k in columns})
)

return Series(table_to_hash._hash()).values
return Series._from_data(
{None: table_to_hash._hash(method=method)}, index=self.index
)

def partition_by_hash(self, columns, nparts, keep_index=True):
"""Partition the dataframe by the hashed value of data in *columns*.
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -623,8 +623,8 @@ def _gather(self, gather_map, keep_index=True, nullify=False):
result._index.names = self._index.names
return result

def _hash(self, initial_hash_values=None):
return libcudf.hash.hash(self, initial_hash_values)
def _hash(self, method, initial_hash=None):
    """Hash this frame by delegating to ``libcudf.hash.hash``.

    Parameters
    ----------
    method : str
        Name of the hash function; forwarded unchanged to
        ``libcudf.hash.hash``.
    initial_hash : optional
        Initial hash value(s); forwarded unchanged. Defaults to None.

    Returns
    -------
    The result of ``libcudf.hash.hash`` for this frame.
    """
    return libcudf.hash.hash(self, method, initial_hash)

def _hash_partition(
self, columns_to_hash, num_partitions, keep_index=True
Expand Down
31 changes: 23 additions & 8 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -4095,13 +4095,20 @@ def floor(self):
"""
return self._unaryop("floor")

def hash_values(self):
def hash_values(self, method="murmur3"):
"""Compute the hash of values in this column.

Parameters
----------
method : {'murmur3', 'md5'}, default 'murmur3'
Hash function to use:
* murmur3: MurmurHash3 hash function.
* md5: MD5 hash function.

Returns
-------
cupy array
A cupy array with hash values.
Series
A Series with hash values.

Examples
--------
Expand All @@ -4112,10 +4119,12 @@ def hash_values(self):
1 120
2 30
dtype: int64
>>> series.hash_values()
>>> series.hash_values(method="murmur3")
0   -1930516747
1     422619251
2    -941520876
dtype: int32
"""
return Series(self._hash()).values
return Series._from_data(
{None: self._hash(method=method)}, index=self.index
)

def hash_encode(self, stop, use_name=False):
"""Encode column values as ints in [0, stop) using hash function.
Expand Down Expand Up @@ -4158,13 +4167,19 @@ def hash_encode(self, stop, use_name=False):
raise ValueError("stop must be a positive integer.")

initial_hash = [hash(self.name) & 0xFFFFFFFF] if use_name else None
hashed_values = Series(self._hash(initial_hash))
hashed_values = Series._from_data(
{
self.name: self._hash(
method="murmur3", initial_hash=initial_hash
)
},
self.index,
)

if hashed_values.has_nulls:
raise ValueError("Column must have no nulls.")

mod_vals = hashed_values % stop
return Series(mod_vals._column, index=self.index, name=self.name)
return hashed_values % stop

def quantile(
self, q=0.5, interpolation="linear", exact=True, quant_index=True
Expand Down
13 changes: 7 additions & 6 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1103,27 +1103,28 @@ def test_assign():


@pytest.mark.parametrize("nrows", [1, 8, 100, 1000])
def test_dataframe_hash_columns(nrows):
@pytest.mark.parametrize("method", ["murmur3", "md5"])
def test_dataframe_hash_columns(nrows, method):
bdice marked this conversation as resolved.
Show resolved Hide resolved
gdf = cudf.DataFrame()
data = np.asarray(range(nrows))
data[0] = data[-1] # make first and last the same
gdf["a"] = data
gdf["b"] = gdf.a + 100
out = gdf.hash_columns(["a", "b"])
assert isinstance(out, cupy.ndarray)
assert isinstance(out, cudf.Series)
assert len(out) == nrows
assert out.dtype == np.int32

# Check default
out_all = gdf.hash_columns()
np.testing.assert_array_equal(cupy.asnumpy(out), cupy.asnumpy(out_all))
assert_eq(out, out_all)

# Check single column
out_one = cupy.asnumpy(gdf.hash_columns(["a"]))
out_one = gdf.hash_columns(["a"], method=method)
# First matches last
assert out_one[0] == out_one[-1]
assert out_one.iloc[0] == out_one.iloc[-1]
# Equivalent to the cudf.Series.hash_values()
np.testing.assert_array_equal(cupy.asnumpy(gdf.a.hash_values()), out_one)
assert_eq(gdf["a"].hash_values(method=method), out_one)


@pytest.mark.parametrize("nrows", [3, 10, 100, 1000])
Expand Down
44 changes: 44 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1272,3 +1272,47 @@ def test_series_sort_index(
assert_eq(ps, gs, check_index_type=True)
else:
assert_eq(expected, got, check_index_type=True)


@pytest.mark.parametrize(
"method,validation_data",
[
(
"md5",
[
"d41d8cd98f00b204e9800998ecf8427e",
"cfcd208495d565ef66e7dff9f98764da",
"3d3aaae21d57b101227f0384f644abe0",
"3e76c7023d771ad1c1520c27ab3d4874",
"f8d805e33ec3ade1a6ea251ac1c118e7",
"c30515f66a5aec7af7666abf33600c92",
"c61a4185135eda043f35e92c3505e180",
"52da74c75cb6575d25be29e66bd0adde",
"5152ac13bdd09110d9ee9c169a3d9237",
"f1d3ff8443297732862df21dc4e57262",
],
)
],
)
def test_series_hash_values(method, validation_data):
inputs = cudf.Series(
[
"",
"0",
"A 56 character string to test message padding algorithm.",
"A 63 character string to test message padding algorithm, again.",
"A 64 character string to test message padding algorithm, again!!",
(
"A very long (greater than 128 bytes/char string) to execute "
"a multi hash-step data point in the hash function being "
"tested. This string needed to be longer."
),
"All work and no play makes Jack a dull boy",
Copy link
Contributor

@shwina shwina Oct 12, 2021

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

<heres_johnny.gif>

"!\"#$%&'()*+,-./0123456789:;<=>?@[\\]^_`{|}~",
"\x00\x00\x00\x10\x00\x00\x00\x00",
"\x00\x00\x00\x00",
]
)
validation_results = cudf.Series(validation_data)
hash_values = inputs.hash_values(method=method)
assert_eq(hash_values, validation_results)