Skip to content

Commit

Permalink
Deprecate Series.hash_encode. (#9457)
Browse files Browse the repository at this point in the history
Resolves #9381 by deprecating `Series.hash_encode`. See issue for details.

Authors:
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - https://github.com/brandon-b-miller
  - Michael Wang (https://github.com/isVoid)

URL: #9457
  • Loading branch information
bdice authored Oct 19, 2021
1 parent 5e2aaf9 commit 4e04334
Show file tree
Hide file tree
Showing 2 changed files with 20 additions and 4 deletions.
10 changes: 10 additions & 0 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3510,6 +3510,9 @@ def hash_values(self, method="murmur3"):
     def hash_encode(self, stop, use_name=False):
         """Encode column values as ints in [0, stop) using hash function.

+        This method is deprecated. Replace ``series.hash_encode(stop,
+        use_name=False)`` with ``series.hash_values(method="murmur3") % stop``.
+
         Parameters
         ----------
         stop : int
Expand Down Expand Up @@ -3544,6 +3547,13 @@ def hash_encode(self, stop, use_name=False):
2 76
dtype: int32
"""
+        warnings.warn(
+            "The `hash_encode` method will be removed in a future cuDF "
+            "release. Replace `series.hash_encode(stop, use_name=False)` "
+            'with `series.hash_values(method="murmur3") % stop`.',
+            FutureWarning,
+        )
+
         if not stop > 0:
             raise ValueError("stop must be a positive integer.")

Expand Down
14 changes: 10 additions & 4 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2218,24 +2218,30 @@ def test_series_hash_encode(nrows):
     s = cudf.Series(data, name=1)
     num_features = 1000

-    encoded_series = s.hash_encode(num_features)
+    with pytest.warns(FutureWarning):
+        encoded_series = s.hash_encode(num_features)
     assert isinstance(encoded_series, cudf.Series)
     enc_arr = encoded_series.to_numpy()
     assert np.all(enc_arr >= 0)
     assert np.max(enc_arr) < num_features

-    enc_with_name_arr = s.hash_encode(num_features, use_name=True).to_numpy()
+    with pytest.warns(FutureWarning):
+        enc_with_name_arr = s.hash_encode(
+            num_features, use_name=True
+        ).to_numpy()
     assert enc_with_name_arr[0] != enc_arr[0]


 def test_series_hash_encode_reproducible_results():
     # Regression test to ensure that hash_encode outputs are reproducible
     data = cudf.Series([0, 1, 2])
-    hash_result = data.hash_encode(stop=2 ** 16, use_name=False)
+    with pytest.warns(FutureWarning):
+        hash_result = data.hash_encode(stop=2 ** 16, use_name=False)
     expected_result = cudf.Series([42165, 55037, 7341])
     assert_eq(hash_result, expected_result)

-    hash_result_with_name = data.hash_encode(stop=2 ** 16, use_name=True)
+    with pytest.warns(FutureWarning):
+        hash_result_with_name = data.hash_encode(stop=2 ** 16, use_name=True)
     expected_result_with_name = cudf.Series([36137, 39649, 58673])
     assert_eq(hash_result_with_name, expected_result_with_name)

Expand Down

0 comments on commit 4e04334

Please sign in to comment.