From f9230049e705590ab6f81a898ea398214be988c2 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Dec 2023 16:08:51 -0800 Subject: [PATCH 1/3] Ensure RangeIndex is kept when reindexing --- python/cudf/cudf/core/dataframe.py | 11 ++++--- python/cudf/cudf/core/indexed_frame.py | 42 +++++++++++++++++------- python/cudf/cudf/tests/test_dataframe.py | 28 ++++++++++++++++ 3 files changed, 65 insertions(+), 16 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index f2e57d219e3..ab9f5d15dee 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2744,11 +2744,12 @@ def reindex( else: if columns is None: columns = labels - df = ( - self - if columns is None - else self[list(set(self._column_names) & set(columns))] - ) + if columns is None: + df = self + else: + columns = as_index(columns) + intersection = self.columns.intersection(columns.to_pandas()) + df = self.loc[:, intersection] return df._reindex( column_names=columns, diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 86a8e64b213..a221094626c 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2584,7 +2584,7 @@ def _reindex( Parameters ---------- columns_names : array-like - The list of columns to select from the Frame, + array-like of columns to select from the Frame, if ``columns`` is a superset of ``Frame.columns`` new columns are created. 
dtypes : dict @@ -2646,9 +2646,33 @@ def _reindex( df = df.take(index.argsort(ascending=True).argsort()) index = index if index is not None else df.index - names = ( - column_names if column_names is not None else list(df._data.names) - ) + + if column_names is None: + names = list(df._data.names) + level_names = self._data.level_names + multiindex = self._data.multiindex + rangeindex = self._data.rangeindex + elif isinstance(column_names, (pd.Index, cudf.Index)): + if isinstance(column_names, (pd.MultiIndex, cudf.MultiIndex)): + multiindex = True + if isinstance(column_names, cudf.MultiIndex): + names = list(iter(column_names.to_pandas())) + else: + names = list(iter(column_names)) + rangeindex = False + else: + multiindex = False + names = column_names + rangeindex = isinstance( + column_names, (pd.RangeIndex, cudf.RangeIndex) + ) + level_names = tuple(column_names.names) + else: + names = column_names + level_names = None + multiindex = False + rangeindex = False + cols = { name: ( df._data[name].copy(deep=deep) @@ -2661,17 +2685,13 @@ def _reindex( ) for name in names } - if column_names is None: - level_names = self._data.level_names - elif isinstance(column_names, pd.Index): - level_names = tuple(column_names.names) - else: - level_names = None + result = self.__class__._from_data( data=cudf.core.column_accessor.ColumnAccessor( cols, - multiindex=self._data.multiindex, + multiindex=multiindex, level_names=level_names, + rangeindex=rangeindex, ), index=index, ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 6ddab6ed3f5..0bbec73260a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3378,6 +3378,34 @@ def test_series_string_reindex(copy): ) +def test_reindex_tuple_col_to_multiindex(): + idx = pd.Index( + [("A", "one"), ("A", "two")], dtype="object", tupleize_cols=False + ) + df = pd.DataFrame([[1, 2]], columns=idx) + gdf = cudf.from_pandas(df) + midx = 
cudf.MultiIndex.from_tuples( + [("A", "one"), ("A", "two")], names=["a", "b"] + ) + result = gdf.reindex(columns=midx) + expected = cudf.DataFrame([[1, 2]], columns=midx) + assert_eq(result, expected) + + +@pytest.mark.parametrize("name", [None, "foo"]) +@pytest.mark.parametrize("klass", [range, cudf.RangeIndex, pd.RangeIndex]) +def test_reindex_columns_rangeindex_keeps_rangeindex(name, klass): + new_columns = klass(3) + exp_name = None + if klass is not range: + new_columns.name = name + exp_name = name + df = cudf.DataFrame([[1, 2]]) + result = df.reindex(columns=new_columns).columns + expected = pd.RangeIndex(3, name=exp_name) + assert_eq(result, expected) + + def test_to_frame(pdf, gdf): assert_eq(pdf.x.to_frame(), gdf.x.to_frame()) From 3e6902a02577a885d027cebe88eeb390fe67d591 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Fri, 8 Dec 2023 16:15:57 -0800 Subject: [PATCH 2/3] Fix DataFrame.reindex when reindexing to MultiIndex/RangeIndex --- python/cudf/cudf/tests/test_dataframe.py | 25 ++++++++++++++++++++---- 1 file changed, 21 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 0bbec73260a..ce3c3641a65 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -3378,15 +3378,32 @@ def test_series_string_reindex(copy): ) -def test_reindex_tuple_col_to_multiindex(): +@pytest.mark.parametrize("names", [None, ["a", "b"]]) +@pytest.mark.parametrize("klass", [cudf.MultiIndex, pd.MultiIndex]) +def test_reindex_multiindex_col_to_multiindex(names, klass): idx = pd.Index( - [("A", "one"), ("A", "two")], dtype="object", tupleize_cols=False + [("A", "one"), ("A", "two")], + dtype="object", ) df = pd.DataFrame([[1, 2]], columns=idx) gdf = cudf.from_pandas(df) - midx = cudf.MultiIndex.from_tuples( - [("A", "one"), ("A", "two")], names=["a", "b"] + midx = klass.from_tuples([("A", "one"), ("A", 
"three")], names=names) + result = gdf.reindex(columns=midx) + expected = cudf.DataFrame([[1, None]], columns=midx) + # TODO(pandas2.0): check_dtype=False won't be needed + # as None col will return object instead of float + assert_eq(result, expected, check_dtype=False) + + +@pytest.mark.parametrize("names", [None, ["a", "b"]]) +@pytest.mark.parametrize("klass", [cudf.MultiIndex, pd.MultiIndex]) +def test_reindex_tuple_col_to_multiindex(names, klass): + idx = pd.Index( + [("A", "one"), ("A", "two")], dtype="object", tupleize_cols=False ) + df = pd.DataFrame([[1, 2]], columns=idx) + gdf = cudf.from_pandas(df) + midx = klass.from_tuples([("A", "one"), ("A", "two")], names=names) result = gdf.reindex(columns=midx) expected = cudf.DataFrame([[1, 2]], columns=midx) assert_eq(result, expected) From 2258a73f2cbea07f60b4e9efd2fb9ecc312864ef Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 11 Dec 2023 11:29:46 -0800 Subject: [PATCH 3/3] Avoid external API --- python/cudf/cudf/core/dataframe.py | 4 +++- python/cudf/cudf/core/indexed_frame.py | 2 ++ 2 files changed, 5 insertions(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index ab9f5d15dee..9fcbda741f6 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2748,7 +2748,9 @@ def reindex( df = self else: columns = as_index(columns) - intersection = self.columns.intersection(columns.to_pandas()) + intersection = self._data.to_pandas_index().intersection( + columns.to_pandas() + ) df = self.loc[:, intersection] return df._reindex( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index a221094626c..dadfa16e52f 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2663,6 +2663,8 @@ def _reindex( else: multiindex = False names = column_names + if isinstance(names, cudf.Index): + names =
names.to_pandas() rangeindex = isinstance( column_names, (pd.RangeIndex, cudf.RangeIndex) )