Skip to content

Commit

Permalink
Fix Index.difference to handle duplicate values when one of the inputs is empty (#15016)
Browse files Browse the repository at this point in the history

This PR drops duplicate values from the results of the two short-circuit code paths of `Index.difference` (when `other` is empty, and when `self` equals `other`), matching the behavior that was already fixed in `pandas-2.2`.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #15016
  • Loading branch information
galipremsagar authored Feb 10, 2024
1 parent 0c0c7e6 commit 8edbeca
Show file tree
Hide file tree
Showing 2 changed files with 21 additions and 4 deletions.
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -1096,12 +1096,12 @@ def difference(self, other, sort=None):
other = cudf.Index(other, name=getattr(other, "name", self.name))

if not len(other):
res = self._get_reconciled_name_object(other)
res = self._get_reconciled_name_object(other).unique()
if sort:
return res.sort_values()
return res
elif self.equals(other):
res = self[:0]._get_reconciled_name_object(other)
res = self[:0]._get_reconciled_name_object(other).unique()
if sort:
return res.sort_values()
return res
Expand Down
21 changes: 19 additions & 2 deletions python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@
import cudf
from cudf.api.extensions import no_default
from cudf.api.types import is_bool_dtype
from cudf.core._compat import PANDAS_GE_200
from cudf.core._compat import PANDAS_GE_200, PANDAS_GE_220
from cudf.core.index import (
CategoricalIndex,
DatetimeIndex,
Expand Down Expand Up @@ -797,9 +797,26 @@ def test_index_to_series(data):
"name_data,name_other",
[("abc", "c"), (None, "abc"), ("abc", pd.NA), ("abc", "abc")],
)
def test_index_difference(data, other, sort, name_data, name_other):
def test_index_difference(request, data, other, sort, name_data, name_other):
pd_data = pd.Index(data, name=name_data)
pd_other = pd.Index(other, name=name_other)
request.applymarker(
pytest.mark.xfail(
condition=PANDAS_GE_220
and isinstance(pd_data.dtype, pd.CategoricalDtype)
and not isinstance(pd_other.dtype, pd.CategoricalDtype)
and pd_other.isnull().any(),
reason="https://github.com/pandas-dev/pandas/issues/57318",
)
)
request.applymarker(
pytest.mark.xfail(
condition=not PANDAS_GE_220
and len(pd_other) == 0
and len(pd_data) != len(pd_data.unique()),
reason="Bug fixed in pandas-2.2+",
)
)

gd_data = cudf.from_pandas(pd_data)
gd_other = cudf.from_pandas(pd_other)
Expand Down

0 comments on commit 8edbeca

Please sign in to comment.