Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix DataFrame.reindex when column reindexing to MultiIndex/RangeIndex #14605

Merged
merged 4 commits into from
Dec 13, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
13 changes: 8 additions & 5 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -2744,11 +2744,14 @@ def reindex(
else:
if columns is None:
columns = labels
df = (
self
if columns is None
else self[list(set(self._column_names) & set(columns))]
)
if columns is None:
df = self
else:
columns = as_index(columns)
intersection = self._data.to_pandas_index().intersection(
Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Happy to have suggestion if there's a way to do this without dispatching to pandas. This was to ensure the MultiIndex intersection case worked correctly

columns.to_pandas()
)
df = self.loc[:, intersection]

return df._reindex(
column_names=columns,
Expand Down
44 changes: 33 additions & 11 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -2584,7 +2584,7 @@ def _reindex(
Parameters
----------
columns_names : array-like
The list of columns to select from the Frame,
array-like of columns to select from the Frame,
if ``columns`` is a superset of ``Frame.columns`` new
columns are created.
dtypes : dict
Expand Down Expand Up @@ -2646,9 +2646,35 @@ def _reindex(
df = df.take(index.argsort(ascending=True).argsort())

index = index if index is not None else df.index
names = (
column_names if column_names is not None else list(df._data.names)
)

if column_names is None:
names = list(df._data.names)
level_names = self._data.level_names
multiindex = self._data.multiindex
rangeindex = self._data.rangeindex
elif isinstance(column_names, (pd.Index, cudf.Index)):
if isinstance(column_names, (pd.MultiIndex, cudf.MultiIndex)):
multiindex = True
if isinstance(column_names, cudf.MultiIndex):
names = list(iter(column_names.to_pandas()))
else:
names = list(iter(column_names))
rangeindex = False
else:
multiindex = False
names = column_names
if isinstance(names, cudf.Index):
names = names.to_pandas()
rangeindex = isinstance(
column_names, (pd.RangeIndex, cudf.RangeIndex)
)
level_names = tuple(column_names.names)
else:
names = column_names
level_names = None
multiindex = False
rangeindex = False

cols = {
name: (
df._data[name].copy(deep=deep)
Expand All @@ -2661,17 +2687,13 @@ def _reindex(
)
for name in names
}
if column_names is None:
level_names = self._data.level_names
elif isinstance(column_names, pd.Index):
level_names = tuple(column_names.names)
else:
level_names = None

result = self.__class__._from_data(
data=cudf.core.column_accessor.ColumnAccessor(
cols,
multiindex=self._data.multiindex,
multiindex=multiindex,
level_names=level_names,
rangeindex=rangeindex,
),
index=index,
)
Expand Down
45 changes: 45 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -3378,6 +3378,51 @@ def test_series_string_reindex(copy):
)


@pytest.mark.parametrize("names", [None, ["a", "b"]])
@pytest.mark.parametrize("klass", [cudf.MultiIndex, pd.MultiIndex])
def test_reindex_multiindex_col_to_multiindex(names, klass):
idx = pd.Index(
[("A", "one"), ("A", "two")],
dtype="object",
)
df = pd.DataFrame([[1, 2]], columns=idx)
gdf = cudf.from_pandas(df)
midx = klass.from_tuples([("A", "one"), ("A", "three")], names=names)
result = gdf.reindex(columns=midx)
expected = cudf.DataFrame([[1, None]], columns=midx)
# (pandas2.0): check_dtype=False won't be needed
# as None col will return object instead of float
assert_eq(result, expected, check_dtype=False)


@pytest.mark.parametrize("names", [None, ["a", "b"]])
@pytest.mark.parametrize("klass", [cudf.MultiIndex, pd.MultiIndex])
def test_reindex_tuple_col_to_multiindex(names, klass):
idx = pd.Index(
[("A", "one"), ("A", "two")], dtype="object", tupleize_cols=False
)
df = pd.DataFrame([[1, 2]], columns=idx)
gdf = cudf.from_pandas(df)
midx = klass.from_tuples([("A", "one"), ("A", "two")], names=names)
result = gdf.reindex(columns=midx)
expected = cudf.DataFrame([[1, 2]], columns=midx)
assert_eq(result, expected)


@pytest.mark.parametrize("name", [None, "foo"])
@pytest.mark.parametrize("klass", [range, cudf.RangeIndex, pd.RangeIndex])
def test_reindex_columns_rangeindex_keeps_rangeindex(name, klass):
new_columns = klass(3)
exp_name = None
if klass is not range:
new_columns.name = name
exp_name = name
df = cudf.DataFrame([[1, 2]])
result = df.reindex(columns=new_columns).columns
expected = pd.RangeIndex(3, name=exp_name)
assert_eq(result, expected)


def test_to_frame(pdf, gdf):
assert_eq(pdf.x.to_frame(), gdf.x.to_frame())

Expand Down