From 488c7ad57163e44961012bd34696c9a5c77ee8ae Mon Sep 17 00:00:00 2001
From: brandon-b-miller <53796099+brandon-b-miller@users.noreply.github.com>
Date: Fri, 2 Sep 2022 11:47:37 -0500
Subject: [PATCH] Preserve order if necessary when deduping categoricals internally (#11597)

Closes https://github.com/rapidsai/cudf/issues/11486

Authors:
  - https://github.com/brandon-b-miller

Approvers:
  - Ashwin Srinath (https://github.com/shwina)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: https://github.com/rapidsai/cudf/pull/11597
---
 python/cudf/cudf/core/column/categorical.py | 14 +++++++-------
 python/cudf/cudf/core/column/column.py      | 11 ++++++++++-
 python/cudf/cudf/core/dataframe.py          |  8 +++-----
 python/cudf/cudf/tests/test_concat.py       | 16 ++++++++++++++++
 python/cudf/cudf/tests/test_series.py       |  9 +++++++++
 5 files changed, 45 insertions(+), 13 deletions(-)

diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
index 3211bfae94c..d438f47e1c4 100644
--- a/python/cudf/cudf/core/column/categorical.py
+++ b/python/cudf/cudf/core/column/categorical.py
@@ -945,8 +945,8 @@ def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase":
     def data_array_view(self) -> cuda.devicearray.DeviceNDArray:
         return self.codes.data_array_view
 
-    def unique(self) -> CategoricalColumn:
-        codes = self.as_numerical.unique()
+    def unique(self, preserve_order=False) -> CategoricalColumn:
+        codes = self.as_numerical.unique(preserve_order=preserve_order)
         return column.build_categorical_column(
             categories=self.categories,
             codes=column.build_column(codes.base_data, dtype=codes.dtype),
@@ -1316,7 +1316,9 @@ def _concat(
         head = next((obj for obj in objs if obj.valid_count), objs[0])
 
         # Combine and de-dupe the categories
-        cats = column.concat_columns([o.categories for o in objs]).unique()
+        cats = column.concat_columns([o.categories for o in objs]).unique(
+            preserve_order=True
+        )
         objs = [o._set_categories(cats, is_unique=True) for o in objs]
         codes = [o.codes for o in objs]
 
@@ -1456,10 +1458,8 @@ def _set_categories(
         # Ensure new_categories is unique first
         if not (is_unique or new_cats.is_unique):
             # drop_duplicates() instead of unique() to preserve order
-            new_cats = (
-                cudf.Series(new_cats)
-                .drop_duplicates(ignore_index=True)
-                ._column
+            new_cats = cudf.Series(new_cats)._column.unique(
+                preserve_order=True
             )
 
         cur_codes = self.codes
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index f0e38ed5c01..1be0190c94f 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -1028,7 +1028,7 @@ def searchsorted(
             values, side, ascending=ascending, na_position=na_position
         )
 
-    def unique(self) -> ColumnBase:
+    def unique(self, preserve_order=False) -> ColumnBase:
         """
         Get unique values in the data
         """
@@ -1037,6 +1037,15 @@ def unique(self) -> ColumnBase:
         # Few things to note before we can do this optimization is
         # the following issue resolved:
         # https://github.com/rapidsai/cudf/issues/5286
+        if preserve_order:
+            ind = as_column(cupy.arange(0, len(self)))
+
+            # dedup based on the column of data only
+            ind, col = drop_duplicates([ind, self], keys=[1])
+
+            # sort col based on ind
+            map = ind.argsort()
+            return col.take(map)
 
         return drop_duplicates([self], keep="first")[0]
 
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
index 7a8745371c1..97f06efe642 100644
--- a/python/cudf/cudf/core/dataframe.py
+++ b/python/cudf/cudf/core/dataframe.py
@@ -7244,11 +7244,9 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes):
             isinstance(col, cudf.core.column.CategoricalColumn) for col in cols
         ):
             # Combine and de-dupe the categories
-            categories[idx] = (
-                cudf.Series(concat_columns([col.categories for col in cols]))
-                .drop_duplicates(ignore_index=True)
-                ._column
-            )
+            categories[idx] = cudf.Series(
+                concat_columns([col.categories for col in cols])
+            )._column.unique(preserve_order=True)
             # Set the column dtype to the codes' dtype. The categories
             # will be re-assigned at the end
             dtypes[idx] = min_scalar_type(len(categories[idx]))
diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
index 5094d938ea1..bf1e9de5d1a 100644
--- a/python/cudf/cudf/tests/test_concat.py
+++ b/python/cudf/cudf/tests/test_concat.py
@@ -1761,3 +1761,19 @@ def test_concat_decimal_non_numeric(s1, s2, expected):
 def test_concat_struct_column(s1, s2, expected):
     s = gd.concat([s1, s2])
     assert_eq(s, expected, check_index_type=True)
+
+
+def test_concat_categorical_ordering():
+    # https://github.com/rapidsai/cudf/issues/11486
+    sr = pd.Series(
+        ["a", "b", "c", "d", "e", "a", "b", "c", "d", "e"], dtype="category"
+    )
+    sr = sr.cat.set_categories(["d", "a", "b", "c", "e"])
+
+    df = pd.DataFrame({"a": sr})
+    gdf = gd.from_pandas(df)
+
+    expect = pd.concat([df, df, df])
+    got = gd.concat([gdf, gdf, gdf])
+
+    assert_eq(expect, got)
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 6de27980ec2..b1ecb38e4d4 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -1928,3 +1928,12 @@ def test_default_integer_bitwidth_construction(default_integer_bitwidth, data):
 def test_default_float_bitwidth_construction(default_float_bitwidth, data):
     s = cudf.Series(data)
     assert s.dtype == np.dtype(f"f{default_float_bitwidth//8}")
+
+
+def test_series_ordered_dedup():
+    # part of https://github.com/rapidsai/cudf/issues/11486
+    sr = cudf.Series(np.random.randint(0, 100, 1000))
+    # pandas unique() preserves order
+    expect = pd.Series(sr.to_pandas().unique())
+    got = cudf.Series(sr._column.unique(preserve_order=True))
+    assert_eq(expect.values, got.values)