Skip to content

Commit

Permalink
Preserve order if necessary when deduping categoricals internally (NV…
Browse files Browse the repository at this point in the history
  • Loading branch information
brandon-b-miller authored Sep 2, 2022
1 parent dc0d8d1 commit 488c7ad
Show file tree
Hide file tree
Showing 5 changed files with 45 additions and 13 deletions.
14 changes: 7 additions & 7 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -945,8 +945,8 @@ def clip(self, lo: ScalarLike, hi: ScalarLike) -> "column.ColumnBase":
def data_array_view(self) -> cuda.devicearray.DeviceNDArray:
return self.codes.data_array_view

def unique(self) -> CategoricalColumn:
codes = self.as_numerical.unique()
def unique(self, preserve_order=False) -> CategoricalColumn:
codes = self.as_numerical.unique(preserve_order=preserve_order)
return column.build_categorical_column(
categories=self.categories,
codes=column.build_column(codes.base_data, dtype=codes.dtype),
Expand Down Expand Up @@ -1316,7 +1316,9 @@ def _concat(
head = next((obj for obj in objs if obj.valid_count), objs[0])

# Combine and de-dupe the categories
cats = column.concat_columns([o.categories for o in objs]).unique()
cats = column.concat_columns([o.categories for o in objs]).unique(
preserve_order=True
)
objs = [o._set_categories(cats, is_unique=True) for o in objs]
codes = [o.codes for o in objs]

Expand Down Expand Up @@ -1456,10 +1458,8 @@ def _set_categories(
# Ensure new_categories is unique first
if not (is_unique or new_cats.is_unique):
# de-duplicate in an order-preserving way; a plain unique() may reorder
new_cats = (
cudf.Series(new_cats)
.drop_duplicates(ignore_index=True)
._column
new_cats = cudf.Series(new_cats)._column.unique(
preserve_order=True
)

cur_codes = self.codes
Expand Down
11 changes: 10 additions & 1 deletion python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1028,7 +1028,7 @@ def searchsorted(
values, side, ascending=ascending, na_position=na_position
)

def unique(self) -> ColumnBase:
def unique(self, preserve_order=False) -> ColumnBase:
"""
Get unique values in the data
"""
Expand All @@ -1037,6 +1037,15 @@ def unique(self) -> ColumnBase:
# Note that this optimization first requires the following
# issue to be resolved:
# https://github.com/rapidsai/cudf/issues/5286
if preserve_order:
ind = as_column(cupy.arange(0, len(self)))

# dedup based on the column of data only
ind, col = drop_duplicates([ind, self], keys=[1])

# sort col based on ind
map = ind.argsort()
return col.take(map)

return drop_duplicates([self], keep="first")[0]

Expand Down
8 changes: 3 additions & 5 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7244,11 +7244,9 @@ def _find_common_dtypes_and_categories(non_null_columns, dtypes):
isinstance(col, cudf.core.column.CategoricalColumn) for col in cols
):
# Combine and de-dupe the categories
categories[idx] = (
cudf.Series(concat_columns([col.categories for col in cols]))
.drop_duplicates(ignore_index=True)
._column
)
categories[idx] = cudf.Series(
concat_columns([col.categories for col in cols])
)._column.unique(preserve_order=True)
# Set the column dtype to the codes' dtype. The categories
# will be re-assigned at the end
dtypes[idx] = min_scalar_type(len(categories[idx]))
Expand Down
16 changes: 16 additions & 0 deletions python/cudf/cudf/tests/test_concat.py
Original file line number Diff line number Diff line change
Expand Up @@ -1761,3 +1761,19 @@ def test_concat_decimal_non_numeric(s1, s2, expected):
def test_concat_struct_column(s1, s2, expected):
s = gd.concat([s1, s2])
assert_eq(s, expected, check_index_type=True)


def test_concat_categorical_ordering():
    """Concatenating categorical data must match pandas, category order included.

    Regression test for https://github.com/rapidsai/cudf/issues/11486
    """
    # Categories are deliberately set in a non-lexical order so that any
    # internal re-sorting during de-duplication shows up as a mismatch.
    category_order = ["d", "a", "b", "c", "e"]
    values = pd.Series(["a", "b", "c", "d", "e"] * 2, dtype="category")
    values = values.cat.set_categories(category_order)

    pdf = pd.DataFrame({"a": values})
    gdf = gd.from_pandas(pdf)

    expect = pd.concat([pdf] * 3)
    got = gd.concat([gdf] * 3)

    assert_eq(expect, got)
9 changes: 9 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1928,3 +1928,12 @@ def test_default_integer_bitwidth_construction(default_integer_bitwidth, data):
def test_default_float_bitwidth_construction(default_float_bitwidth, data):
s = cudf.Series(data)
assert s.dtype == np.dtype(f"f{default_float_bitwidth//8}")


def test_series_ordered_dedup():
    """``unique(preserve_order=True)`` keeps values in first-occurrence order.

    Part of https://github.com/rapidsai/cudf/issues/11486
    """
    # Use a locally seeded generator so the input (and therefore any
    # failure) is reproducible without touching NumPy's global RNG state.
    rng = np.random.default_rng(seed=0)
    sr = cudf.Series(rng.integers(0, 100, 1000))

    # pandas unique() preserves order of first appearance, so it serves
    # as the reference result.
    expect = pd.Series(sr.to_pandas().unique())
    got = cudf.Series(sr._column.unique(preserve_order=True))

    assert_eq(expect.values, got.values)

0 comments on commit 488c7ad

Please sign in to comment.