From 4bd793f55d738ee9b59501979b0899514a7853eb Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 22 Nov 2023 12:00:00 -0800 Subject: [PATCH 1/5] process pandas dataframes by columns --- python/cudf/cudf/core/dataframe.py | 48 ++++++++++++------------------ 1 file changed, 19 insertions(+), 29 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 43ae9b9e81e..0ed5ad6d78d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5246,29 +5246,20 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): raise ValueError("Duplicate column names are not allowed") # Set columns - data = {} - for col_name, col_value in dataframe.items(): - # necessary because multi-index can return multiple - # columns for a single key - if len(col_value.shape) == 1: - data[col_name] = column.as_column( - col_value.array, nan_as_null=nan_as_null - ) - else: - vals = col_value.values.T - if vals.shape[0] == 1: - data[col_name] = column.as_column( - vals.flatten(), nan_as_null=nan_as_null - ) - else: - if isinstance(col_name, tuple): - col_name = str(col_name) - for idx in range(len(vals.shape)): - data[col_name] = column.as_column( - vals[idx], nan_as_null=nan_as_null - ) - - index = cudf.from_pandas(dataframe.index, nan_as_null=nan_as_null) + data = { + col_name: column.as_column( + col_value.array, nan_as_null=nan_as_null + ) + for col_name, col_value in dataframe.items() + } + if isinstance(dataframe.index, pd.MultiIndex): + index = cudf.MultiIndex.from_pandas( + dataframe.index, nan_as_null=nan_as_null + ) + else: + index = cudf.Index.from_pandas( + dataframe.index, nan_as_null=nan_as_null + ) df = cls._from_data(data, index) df._data._level_names = tuple(dataframe.columns.names) @@ -5279,13 +5270,12 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): df.columns = dataframe.columns return df + elif hasattr(df, "__dataframe__"): + return 
from_dataframe(dataframe, allow_copy=True) else: - try: - return from_dataframe(dataframe, allow_copy=True) - except Exception: - raise TypeError( - f"Could not construct DataFrame from {type(dataframe)}" - ) + raise TypeError( + f"Could not construct DataFrame from {type(dataframe)}" + ) @classmethod @_cudf_nvtx_annotate From 639c396faf2e1c60195d2d5679a069cd4d359caf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 22 Nov 2023 17:13:21 -0800 Subject: [PATCH 2/5] Fix bug in Index.from_pandas --- python/cudf/cudf/core/_base_index.py | 16 ++++++++++++---- python/cudf/cudf/core/dataframe.py | 3 ++- python/cudf/cudf/tests/test_index.py | 7 +++++++ 3 files changed, 21 insertions(+), 5 deletions(-) diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 3616ec1b542..7cf9316abaf 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -1873,10 +1873,18 @@ def from_pandas(cls, index, nan_as_null=no_default): if not isinstance(index, pd.Index): raise TypeError("not a pandas.Index") - - ind = cudf.Index(column.as_column(index, nan_as_null=nan_as_null)) - ind.name = index.name - return ind + if isinstance(index, pd.RangeIndex): + return cudf.RangeIndex( + start=index.start, + stop=index.stop, + step=index.step, + name=index.name, + ) + else: + return cudf.Index( + column.as_column(index, nan_as_null=nan_as_null), + name=index.name, + ) @property def _constructor_expanddim(self): diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 0ed5ad6d78d..99ca9248e2c 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5245,7 +5245,6 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): if not dataframe.columns.is_unique: raise ValueError("Duplicate column names are not allowed") - # Set columns data = { col_name: column.as_column( col_value.array, nan_as_null=nan_as_null @@ 
-5271,6 +5270,8 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): return df elif hasattr(df, "__dataframe__"): + # TODO: Probably should be handled in the constructor as + # this isn't pandas specific return from_dataframe(dataframe, allow_copy=True) else: raise TypeError( diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 087b93f1a02..bd86152b027 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2949,3 +2949,10 @@ def test_index_getitem_from_int(idx): def test_index_getitem_from_nonint_raises(idx): with pytest.raises(ValueError): cudf.Index([1, 2])[idx] + + +def test_from_pandas_rangeindex_return_rangeindex(): + pidx = pd.RangeIndex(start=3, stop=9, step=3, name="a") + result = cudf.Index.from_pandas(pidx) + expected = cudf.RangeIndex(start=3, stop=9, step=3, name="a") + assert_eq(result, expected, exact=True) From ee976b6c83cf87168bee1cfaf7c4c9ca6b5c5b23 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Wed, 22 Nov 2023 17:19:17 -0800 Subject: [PATCH 3/5] simplify global from_pandas --- python/cudf/cudf/core/dataframe.py | 4 ---- 1 file changed, 4 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 99ca9248e2c..5b9acaf5f60 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -7906,10 +7906,6 @@ def from_pandas(obj, nan_as_null=no_default): return ret elif isinstance(obj, pd.MultiIndex): return MultiIndex.from_pandas(obj, nan_as_null=nan_as_null) - elif isinstance(obj, pd.RangeIndex): - return cudf.core.index.RangeIndex( - start=obj.start, stop=obj.stop, step=obj.step, name=obj.name - ) elif isinstance(obj, pd.Index): return cudf.Index.from_pandas(obj, nan_as_null=nan_as_null) elif isinstance(obj, pd.CategoricalDtype): From 8c831744b7883f5f4b69cceb08506f0ace2ad6cf Mon Sep 17 00:00:00 2001 From: Matthew Roeschke
<10647082+mroeschke@users.noreply.github.com> Date: Mon, 27 Nov 2023 11:27:33 -0800 Subject: [PATCH 4/5] Fix variable --- python/cudf/cudf/core/dataframe.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5b9acaf5f60..b25a77691f0 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -5269,7 +5269,7 @@ def from_pandas(cls, dataframe, nan_as_null=no_default): df.columns = dataframe.columns return df - elif hasattr(df, "__dataframe__"): + elif hasattr(dataframe, "__dataframe__"): # TODO: Probably should be handled in the constructor as # this isn't pandas specific return from_dataframe(dataframe, allow_copy=True) From 2a4ddc1e96d81a7cba93b58d2fe787d4f53d4502 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Mon, 27 Nov 2023 16:43:00 -0800 Subject: [PATCH 5/5] Trigger CI