Preserve order if necessary when deduping categoricals internally (NV…

…IDIA#11597) Closes rapidsai/cudf#11486 Authors: - https://github.com/brandon-b-miller Approvers: - Ashwin Srinath (https://github.com/shwina) - Matthew Roeschke (https://github.com/mroeschke) URL: rapidsai/cudf#11597
tgravescs · Sep 2, 2022 · 488c7ad · 488c7ad
1 parent dc0d8d1
commit 488c7ad
Show file tree

Hide file tree

Showing 5 changed files with 45 additions and 13 deletions.
diff --git a/python/cudf/cudf/core/column/categorical.py b/python/cudf/cudf/core/column/categorical.py
@@ -945,8 +945,8 @@ def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase":
     def data_array_view(self) -> cuda.devicearray.DeviceNDArray:
         return self.codes.data_array_view
 
-    def unique(self) -> CategoricalColumn:
-        codes = self.as_numerical.unique()
+    def unique(self, preserve_order=False) -> CategoricalColumn:
+        codes = self.as_numerical.unique(preserve_order=preserve_order)
         return column.build_categorical_column(
             categories=self.categories,
             codes=column.build_column(codes.base_data, dtype=codes.dtype),
@@ -1316,7 +1316,9 @@ def _concat(
         head = next((obj for obj in objs if obj.valid_count), objs[0])
 
         # Combine and de-dupe the categories
-        cats = column.concat_columns([o.categories for o in objs]).unique()
+        cats = column.concat_columns([o.categories for o in objs]).unique(
+            preserve_order=True
+        )
         objs = [o._set_categories(cats, is_unique=True) for o in objs]
         codes = [o.codes for o in objs]
 
@@ -1456,10 +1458,8 @@ def _set_categories(
         # Ensure new_categories is unique first
         if not (is_unique or new_cats.is_unique):
             # drop_duplicates() instead of unique() to preserve order
-            new_cats = (
-                cudf.Series(new_cats)
-                .drop_duplicates(ignore_index=True)
-                ._column
+            new_cats = cudf.Series(new_cats)._column.unique(
+                preserve_order=True
             )
 
         cur_codes = self.codes

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
@@ -1028,7 +1028,7 @@ def searchsorted(
             values, side, ascending=ascending, na_position=na_position
         )
 
-    def unique(self) -> ColumnBase:
+    def unique(self, preserve_order=False) -> ColumnBase:
         """
         Get unique values in the data
         """
@@ -1037,6 +1037,15 @@ def unique(self) -> ColumnBase:
         # Few things to note before we can do this optimization is
         # the following issue resolved:
         # https://github.com/rapidsai/cudf/issues/5286
+        if preserve_order:
+            ind = as_column(cupy.arange(0, len(self)))
+
+            # dedup based on the column of data only
+            ind, col = drop_duplicates([ind, self], keys=[1])
+
+            # sort col based on ind
+            map = ind.argsort()
+            return col.take(map)
 
         return drop_duplicates([self], keep="first")[0]
 

diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -7244,11 +7244,9 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes):
             isinstance(col, cudf.core.column.CategoricalColumn) for col in cols
         ):
             # Combine and de-dupe the categories
-            categories[idx] = (
-                cudf.Series(concat_columns([col.categories for col in cols]))
-                .drop_duplicates(ignore_index=True)
-                ._column
-            )
+            categories[idx] = cudf.Series(
+                concat_columns([col.categories for col in cols])
+            )._column.unique(preserve_order=True)
             # Set the column dtype to the codes' dtype. The categories
             # will be re-assigned at the end
             dtypes[idx] = min_scalar_type(len(categories[idx]))

diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py
@@ -1761,3 +1761,19 @@ def test_concat_decimal_non_numeric(s1, s2, expected):
 def test_concat_struct_column(s1, s2, expected):
     s = gd.concat([s1, s2])
     assert_eq(s, expected, check_index_type=True)
+
+
+def test_concat_categorical_ordering():
+    # https://github.com/rapidsai/cudf/issues/11486
+    sr = pd.Series(
+        ["a", "b", "c", "d", "e", "a", "b", "c", "d", "e"], dtype="category"
+    )
+    sr = sr.cat.set_categories(["d", "a", "b", "c", "e"])
+
+    df = pd.DataFrame({"a": sr})
+    gdf = gd.from_pandas(df)
+
+    expect = pd.concat([df, df, df])
+    got = gd.concat([gdf, gdf, gdf])
+
+    assert_eq(expect, got)
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
@@ -1928,3 +1928,12 @@ def test_default_integer_bitwidth_construction(default_integer_bitwidth, data):
 def test_default_float_bitwidth_construction(default_float_bitwidth, data):
     s = cudf.Series(data)
     assert s.dtype == np.dtype(f"f{default_float_bitwidth//8}")
+
+
+def test_series_ordered_dedup():
+    # part of https://github.com/rapidsai/cudf/issues/11486
+    sr = cudf.Series(np.random.randint(0, 100, 1000))
+    # pandas unique() preserves order
+    expect = pd.Series(sr.to_pandas().unique())
+    got = cudf.Series(sr._column.unique(preserve_order=True))
+    assert_eq(expect.values, got.values)