Skip to content

Commit

Permalink
Add cudf python groupby.diff (#9446)
Browse files Browse the repository at this point in the history
Fixes #5079
- [x] add cudf python groupby diff (implemented using groupby.shift)
- [x] add unit tests

Authors:
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Ashwin Srinath (https://github.com/shwina)

URL: #9446
  • Loading branch information
karthikeyann authored Oct 21, 2021
1 parent e4e4870 commit 23de7d0
Show file tree
Hide file tree
Showing 3 changed files with 116 additions and 0 deletions.
1 change: 1 addition & 0 deletions docs/cudf/source/api_docs/groupby.rst
Original file line number Diff line number Diff line change
Expand Up @@ -77,6 +77,7 @@ application to columns of a specific data type.
DataFrameGroupBy.cummin
DataFrameGroupBy.cumsum
DataFrameGroupBy.describe
DataFrameGroupBy.diff
DataFrameGroupBy.ffill
DataFrameGroupBy.fillna
DataFrameGroupBy.idxmax
Expand Down
32 changes: 32 additions & 0 deletions python/cudf/cudf/core/groupby/groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -860,6 +860,38 @@ def last(self):
"""Get the last non-null value in each group."""
return self.agg("last")

def diff(self, periods=1, axis=0):
    """Get the difference between the values in each group.

    The result is the grouped values minus the same values shifted by
    ``periods`` (via ``self.shift``).

    Parameters
    ----------
    periods : int, default 1
        Periods to shift for calculating difference,
        accepts negative values.
    axis : {0 or 'index', 1 or 'columns'}, default 0
        Take difference over rows (0) or columns (1).
        Only row-wise (0 or 'index') difference is supported.

    Returns
    -------
    Series or DataFrame
        First differences of the Series or DataFrame.

    Raises
    ------
    NotImplementedError
        If ``axis`` is anything other than 0 or 'index'.
    """
    # 'index' is the documented spelling of axis 0, so accept both
    # instead of rejecting the alias the docstring advertises.
    if axis not in (0, "index"):
        raise NotImplementedError("Only axis=0 is supported.")

    # Materialise the value columns through the groupby machinery so
    # they can be subtracted from the shifted result.
    value_columns = self.grouping.values
    _, (data, index), _ = self._groupby.groups(
        cudf.core.frame.Frame(value_columns._data)
    )
    grouped = self.obj.__class__._from_data(data, index)
    # NOTE(review): presumably restores the caller's row order so the
    # rows line up with what ``shift`` returns -- confirm.
    grouped = self._mimic_pandas_order(grouped)

    result = grouped - self.shift(periods=periods)
    # Re-apply the type metadata of the original value columns.
    return result._copy_type_metadata(value_columns)

def _scan_fill(self, method: str, limit: int) -> DataFrameOrSeries:
"""Internal implementation for `ffill` and `bfill`
"""
Expand Down
83 changes: 83 additions & 0 deletions python/cudf/cudf/tests/test_groupby.py
Original file line number Diff line number Diff line change
Expand Up @@ -1913,6 +1913,89 @@ def test_groupby_shift_row_zero_shift(nelem, fill_value):
)


@pytest.mark.parametrize("nelem", [2, 3, 100, 1000])
@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5])
@pytest.mark.parametrize("direction", [1, -1])
def test_groupby_diff_row(nelem, shift_perc, direction):
    """Grouped ``diff`` in cudf must agree with pandas.

    The period is a fraction (``shift_perc``) of the frame length,
    signed by ``direction``, so periods both shorter and longer than
    the frame are covered in either direction.
    """
    pandas_frame = make_frame(
        pd.DataFrame, nelem=nelem, extra_vals=["val2"]
    )
    cudf_frame = cudf.from_pandas(pandas_frame)
    periods = direction * int(nelem * shift_perc)

    pandas_diff = pandas_frame.groupby(["x", "y"]).diff(periods=periods)
    cudf_diff = cudf_frame.groupby(["x", "y"]).diff(periods=periods)

    # Only the value columns are compared.
    assert_groupby_results_equal(
        pandas_diff[["val", "val2"]], cudf_diff[["val", "val2"]]
    )


@pytest.mark.parametrize("nelem", [10, 50, 100, 1000])
@pytest.mark.parametrize("shift_perc", [0.5, 1.0, 1.5])
@pytest.mark.parametrize("direction", [1, -1])
def test_groupby_diff_row_mixed_numerics(nelem, shift_perc, direction):
    """Grouped ``diff`` must match pandas across mixed numeric dtypes.

    Column "0" (int64, no nulls) is the group key; columns "1" to "5"
    are value columns generated with a 0.4 null frequency.
    """
    nullable_dtypes = [
        "int64",
        "float32",
        "decimal64",
        "datetime64[ns]",
        "timedelta64[ns]",
    ]
    # First spec is the null-free key column, the rest are the values.
    column_specs = [
        {"dtype": "int64", "null_frequency": 0, "cardinality": 10}
    ]
    column_specs.extend(
        {"dtype": dtype, "null_frequency": 0.4, "cardinality": 10}
        for dtype in nullable_dtypes
    )
    generated = rand_dataframe(
        dtypes_meta=column_specs, rows=nelem, use_threads=False
    )

    pandas_frame = generated.to_pandas()
    cudf_frame = cudf.from_pandas(pandas_frame)
    periods = direction * int(nelem * shift_perc)

    pandas_diff = pandas_frame.groupby(["0"]).diff(periods=periods)
    cudf_diff = cudf_frame.groupby(["0"]).diff(periods=periods)

    assert_groupby_results_equal(
        pandas_diff[["1", "2", "3", "4", "5"]],
        cudf_diff[["1", "2", "3", "4", "5"]],
    )


@pytest.mark.parametrize("nelem", [10, 50, 100, 1000])
def test_groupby_diff_row_zero_shift(nelem):
    """Grouped ``diff`` with ``periods=0`` must agree with pandas.

    Column "0" (int64, no nulls) is the group key; columns "1" to "4"
    are value columns generated with a 0.4 null frequency.
    """
    t = rand_dataframe(
        dtypes_meta=[
            {"dtype": "int64", "null_frequency": 0, "cardinality": 10},
            {"dtype": "int64", "null_frequency": 0.4, "cardinality": 10},
            {"dtype": "float32", "null_frequency": 0.4, "cardinality": 10},
            {
                "dtype": "datetime64[ns]",
                "null_frequency": 0.4,
                "cardinality": 10,
            },
            {
                "dtype": "timedelta64[ns]",
                "null_frequency": 0.4,
                "cardinality": 10,
            },
        ],
        rows=nelem,
        use_threads=False,
    )
    pdf = t.to_pandas()
    gdf = cudf.from_pandas(pdf)

    # Exercise ``diff`` itself: this test previously called
    # ``shift(periods=0)``, so the zero-period diff path was never run.
    expected = pdf.groupby(["0"]).diff(periods=0)
    got = gdf.groupby(["0"]).diff(periods=0)

    assert_groupby_results_equal(
        expected[["1", "2", "3", "4"]], got[["1", "2", "3", "4"]]
    )


# TODO: test for category columns when cudf.Scalar supports category type
@pytest.mark.parametrize("nelem", [10, 100, 1000])
def test_groupby_fillna_multi_value(nelem):
Expand Down

0 comments on commit 23de7d0

Please sign in to comment.