diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index c686cd0fd39..f9fef7dc4dc 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6145,6 +6145,37 @@ def __dataframe__( self, nan_as_null=nan_as_null, allow_copy=allow_copy ) + def nunique(self, axis=0, dropna=True): + """ + Count number of distinct elements in specified axis. + Return Series with number of distinct elements. Can ignore NaN values. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'}, default 0 + The axis to use. 0 or 'index' for row-wise, 1 or 'columns' for column-wise. + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'A': [4, 5, 6], 'B': [4, 1, 1]}) + >>> df.nunique() + A 3 + B 2 + dtype: int64 + """ + if axis != 0: + raise NotImplementedError( + "axis parameter is not supported yet." + ) + + return cudf.Series(super().nunique(method="sort", dropna=dropna)) def from_dataframe(df, allow_copy=False): return df_protocol.from_dataframe(df, allow_copy=allow_copy) diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 1d59d9f3b1a..6142f0f0f40 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -6420,6 +6420,27 @@ def ge(self, other, axis="columns", level=None, fill_value=None): other=other, fn="ge", fill_value=fill_value, can_reindex=True ) + def nunique(self, method: builtins.str = "sort", dropna: bool = True): + """ + Returns a per column mapping with counts of unique values for + each column. + + Parameters + ---------- + method : builtins.str, default "sort" + Method used by cpp_distinct_count + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + dict + Name and unique value counts of each column in frame. + """ + return { + name: col.distinct_count(method=method, dropna=dropna) + for name, col in self._data.items() + } def _get_replacement_values_for_columns( to_replace: Any, value: Any, columns_dtype_map: Dict[Any, Any] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index e96531d4b1c..0032dc25cee 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2947,7 +2947,7 @@ def nunique(self, method="sort", dropna=True): raise NotImplementedError(msg) if self.null_count == len(self): return 0 - return self._column.distinct_count(method, dropna) + return super().nunique(method, dropna) def value_counts( self, diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index 7793a2fdf29..85a85b1930f 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -336,3 +336,21 @@ def _make_operands_for_binop( return NotImplemented return {result_name: (self._column, other, reflect, fill_value)} + + def nunique(self, method: builtins.str = "sort", dropna: bool = True): + """ + Returns count of unique values for the column. + + Parameters + ---------- + method : builtins.str, default "sort" + Method used by cpp_distinct_count + dropna : bool, default True + Don't include NaN in the counts. + + Returns + ------- + int + Number of unique values in the column. + """ + return sum(super().nunique(method=method, dropna=dropna).values()) \ No newline at end of file diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 3e359335719..d81451e9d54 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -9077,3 +9077,27 @@ def test_dataframe_assign_cp_np_array(): gdf[[f"f_{i}" for i in range(n)]] = cp_ndarray assert_eq(pdf, gdf) + +@pytest.mark.parametrize( + "data", [{"a": [1, 2, 3], "b": [1, 1, 0]}], +) +def test_dataframe_nunique(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.nunique() + expected = pdf.nunique() + + assert_eq(expected, actual) + +@pytest.mark.parametrize( + "data", [{ "key": [0, 1, 1, 0, 0, 1], "val": [1, 8, 3, 9, -3, 8]}], +) +def test_dataframe_nunique_index(data): + gdf = cudf.DataFrame(data) + pdf = gdf.to_pandas() + + actual = gdf.index.nunique() + expected = pdf.index.nunique() + + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index ffdd53c58ac..ee891828b3c 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1517,3 +1517,27 @@ def test_series_transpose(data): assert_eq(pd_transposed, cudf_transposed) assert_eq(pd_property, cudf_property) assert_eq(cudf_transposed, csr) + +@pytest.mark.parametrize( + "data", [1, 3, 5, 7, 7], +) +def test_series_nunique(data): + cd_s = cudf.Series(data) + pd_s = cd_s.to_pandas() + + actual = cd_s.nunique() + expected = pd_s.nunique() + + assert_eq(expected, actual) + +@pytest.mark.parametrize( + "data", [1, 3, 5, 7, 7], +) +def test_series_nunique_index(data): + cd_s = cudf.Series(data) + pd_s = cd_s.to_pandas() + + actual = cd_s.index.nunique() + expected = pd_s.index.nunique() + + assert_eq(expected, actual) \ No newline at end of file