From 1935a8a9de87152e70b7930c911e0a44da0560cc Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR
Date: Tue, 31 Aug 2021 18:07:15 -0500
Subject: [PATCH] Add support for `get_group` in GroupBy (#9070)

This PR adds `get_group` functionality to `GroupBy`.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)

URL: https://github.com/rapidsai/cudf/pull/9070
---
 python/cudf/cudf/core/groupby/groupby.py | 37 ++++++++++++++++++
 python/cudf/cudf/tests/test_groupby.py   | 49 ++++++++++++++++++++++++
 2 files changed, 86 insertions(+)

diff --git a/python/cudf/cudf/core/groupby/groupby.py b/python/cudf/cudf/core/groupby/groupby.py
index fd425d9de76..d98a78efb18 100644
--- a/python/cudf/cudf/core/groupby/groupby.py
+++ b/python/cudf/cudf/core/groupby/groupby.py
@@ -103,6 +103,43 @@ def groups(self):
             zip(group_names.to_pandas(), grouped_index._split(offsets[1:-1]))
         )
 
+    def get_group(self, name, obj=None):
+        """
+        Construct DataFrame from group with provided name.
+
+        Parameters
+        ----------
+        name : object
+            The name of the group to get as a DataFrame.
+        obj : DataFrame, default None
+            The DataFrame to select the group's rows from. If it is
+            None, the object groupby was called on will be used.
+
+        Returns
+        -------
+        group : same type as obj
+
+        Examples
+        --------
+        >>> import cudf
+        >>> df = cudf.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]})
+        >>> df
+           X  Y
+        0  A  1
+        1  B  4
+        2  A  3
+        3  B  2
+        >>> df.groupby("X").get_group("A")
+           X  Y
+        0  A  1
+        2  A  3
+        """
+        if obj is None:
+            obj = self.obj
+        # ``self.groups[name]`` holds the index labels of the group's rows.
+        # Drop repeated labels so ``.loc`` returns each matching row once.
+        return obj.loc[self.groups[name].drop_duplicates()]
+
     def size(self):
         """
         Return the size of each group.
diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py
index df6a9336e97..7719df492f7 100644
--- a/python/cudf/cudf/tests/test_groupby.py
+++ b/python/cudf/cudf/tests/test_groupby.py
@@ -2114,3 +2114,52 @@ def foo(x):
     expect = make_frame(pd.DataFrame, 100).groupby("x").y.apply(foo)
 
     assert_groupby_results_equal(expect, got)
+
+
+@pytest.mark.parametrize(
+    "pdf, group, name, obj",
+    [
+        (
+            pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}),
+            "X",
+            "A",
+            None,
+        ),
+        (
+            pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}),
+            "X",
+            "B",
+            None,
+        ),
+        (
+            pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}),
+            "X",
+            "A",
+            pd.DataFrame({"a": [1, 2, 4, 5, 10, 11]}),
+        ),
+        (
+            pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}),
+            "Y",
+            1,
+            pd.DataFrame({"a": [1, 2, 4, 5, 10, 11]}),
+        ),
+        (
+            pd.DataFrame({"X": ["A", "B", "A", "B"], "Y": [1, 4, 3, 2]}),
+            "Y",
+            3,
+            pd.DataFrame({"a": [1, 2, 0, 11]}),
+        ),
+    ],
+)
+def test_groupby_get_group(pdf, group, name, obj):
+    """``get_group`` on a cuDF groupby must match the pandas result."""
+    # ``obj`` may be None or a pandas DataFrame; only the latter converts.
+    cudf_obj = (
+        cudf.from_pandas(obj) if isinstance(obj, pd.DataFrame) else obj
+    )
+    pandas_groups = pdf.groupby(group)
+    cudf_groups = cudf.from_pandas(pdf).groupby(group)
+
+    expected = pandas_groups.get_group(name=name, obj=obj)
+    actual = cudf_groups.get_group(name=name, obj=cudf_obj)
+    assert_groupby_results_equal(expected, actual)