From e9d5da44821f1f01203a1e2bd2be97d413297050 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Oct 2021 11:36:51 -0700 Subject: [PATCH 1/6] Move to_dict. --- python/cudf/cudf/core/dataframe.py | 7 ------- python/cudf/cudf/core/indexed_frame.py | 7 +++++++ python/cudf/cudf/core/series.py | 7 ------- 3 files changed, 7 insertions(+), 14 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 5239cf9d648..827968bf01e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6238,13 +6238,6 @@ def corr(self): df.columns = self.columns return df - def to_dict(self, orient="dict", into=dict): - raise TypeError( - "cuDF does not support conversion to host memory " - "via `to_dict()` method. Consider using " - "`.to_pandas().to_dict()` to construct a Python dictionary." - ) - def to_struct(self, name=None): """ Return a struct Series composed of the columns of the DataFrame. diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index aba24171f06..aa4d3855051 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -98,6 +98,13 @@ class IndexedFrame(Frame): def __init__(self, data=None, index=None): super().__init__(data=data, index=index) + def to_dict(self, *args, **kwargs): # noqa: D102 + raise TypeError( + "cuDF does not support conversion to host memory " + "via `to_dict()` method. Consider using " + "`.to_pandas().to_dict()` to construct a Python dictionary." + ) + @cached_property def loc(self): """Select rows and columns by label or boolean mask. 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index aada0534f42..a0421accc52 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1156,13 +1156,6 @@ def __getitem__(self, arg): items = SingleColumnFrame.__iter__ - def to_dict(self, into=dict): - raise TypeError( - "cuDF does not support conversion to host memory " - "via `to_dict()` method. Consider using " - "`.to_pandas().to_dict()` to construct a Python dictionary." - ) - def __setitem__(self, key, value): if isinstance(key, slice): self.iloc[key] = value From 618d3adb37971ffc1515a4a2047b76c3c722c449 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Oct 2021 12:17:21 -0700 Subject: [PATCH 2/6] Merge index property. --- python/cudf/cudf/core/dataframe.py | 33 -------------------------- python/cudf/cudf/core/indexed_frame.py | 19 +++++++++++++++ python/cudf/cudf/core/series.py | 9 ------- 3 files changed, 19 insertions(+), 42 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 827968bf01e..61354a27886 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2123,39 +2123,6 @@ def _rename_columns(self, new_names): mapper = dict(zip(old_cols, new_names)) self.rename(mapper=mapper, inplace=True, axis=1) - @property - def index(self): - """Returns the index of the DataFrame""" - return self._index - - @index.setter - def index(self, value): - old_length = ( - self._num_rows if self._index is None else len(self._index) - ) - if isinstance(value, cudf.core.multiindex.MultiIndex): - if len(self._data) > 0 and len(value) != old_length: - msg = ( - f"Length mismatch: Expected axis has {old_length} " - f"elements, new values have {len(value)} elements" - ) - raise ValueError(msg) - self._index = value - return - - new_length = len(value) - - if len(self._data) > 0 and new_length != old_length: - msg = ( - f"Length mismatch: Expected axis has {old_length} 
elements, " - f"new values have {new_length} elements" - ) - raise ValueError(msg) - - # try to build an index from generic _index - idx = as_index(value) - self._index = idx - def _reindex( self, columns, dtypes=None, deep=False, index=None, inplace=False ): diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index aa4d3855051..e7066e336ab 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -12,6 +12,7 @@ import cudf from cudf.api.types import is_categorical_dtype, is_list_like from cudf.core.frame import Frame +from cudf.core.index import Index from cudf.core.multiindex import MultiIndex from cudf.utils.utils import cached_property @@ -105,6 +106,24 @@ def to_dict(self, *args, **kwargs): # noqa: D102 "`.to_pandas().to_dict()` to construct a Python dictionary." ) + @property + def index(self): + """Get the labels for the rows.""" + return self._index + + @index.setter + def index(self, value): + old_length = len(self) + new_length = len(value) + + # A DataFrame with 0 columns can have an index of arbitrary length. + if len(self._data) > 0 and new_length != old_length: + raise ValueError( + f"Length mismatch: Expected axis has {old_length} elements, " + f"new values have {new_length} elements" + ) + self._index = Index(value) + @cached_property def loc(self): """Select rows and columns by label or boolean mask. 
diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index a0421accc52..05441f83159 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2111,15 +2111,6 @@ def data(self): """ # noqa: E501 return self._column.data - @property - def index(self): - """The index object""" - return self._index - - @index.setter - def index(self, _index): - self._index = as_index(_index) - @property def nullmask(self): """The gpu buffer for the null-mask""" From b7c32eb0cad7961742edacd4e8f34474eb824277 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 14 Oct 2021 12:32:02 -0700 Subject: [PATCH 3/6] Add test of series index setting. --- python/cudf/cudf/tests/test_series.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 7d62532776e..f81b0819021 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -1316,3 +1316,9 @@ def test_series_hash_values(method, validation_data): validation_results = cudf.Series(validation_data) hash_values = inputs.hash_values(method=method) assert_eq(hash_values, validation_results) + + +def test_set_index_unequal_length(): + s = cudf.Series() + with pytest.raises(ValueError): + s.index = [1, 2, 3] From d71a882988df6e1c337a3b319b6b7739004dc223 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 18 Oct 2021 18:16:51 -0700 Subject: [PATCH 4/6] Deprecate set_index. 
--- python/cudf/cudf/core/series.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 05441f83159..ded57c6236c 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -918,6 +918,10 @@ def set_index(self, index): e 14 dtype: int64 """ + warnings.warn( + "Series.set_index is deprecated and will be removed in the future", + FutureWarning, + ) index = index if isinstance(index, BaseIndex) else as_index(index) return self._from_data(self._data, index, self.name) From bc6d3b3f72001bf0cf5ef3eca0ccbe948079b1fd Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 18 Oct 2021 22:47:57 -0700 Subject: [PATCH 5/6] Start cleaning up reindex. --- python/cudf/cudf/core/dataframe.py | 59 ++++++++++++++++++------------ 1 file changed, 36 insertions(+), 23 deletions(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 61354a27886..7c0a13f99f7 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -917,8 +917,12 @@ def dtypes(self): string object dtype: object """ - return cudf.utils.utils._create_pandas_series( - data=[x.dtype for x in self._data.columns], index=self._data.names, + return pd.Series(self._dtypes, dtype="object") + + @property + def _dtypes(self): + return dict( + zip(self._data.names, (col.dtype for col in self._data.columns)) ) @property @@ -2155,18 +2159,16 @@ def _reindex( if index is not None: index = cudf.core.index.as_index(index) - if isinstance(index, cudf.MultiIndex): - idx_dtype_match = all( - left_dtype == right_dtype - for left_dtype, right_dtype in zip( - (col.dtype for col in df.index._data.columns), - (col.dtype for col in index._data.columns), - ) + idx_dtype_match = (df.index.nlevels == index.nlevels) and all( + left_dtype == right_dtype + for left_dtype, right_dtype in zip( + (col.dtype for col in df.index._data.columns), + (col.dtype for col in index._data.columns), ) - else: 
- idx_dtype_match = df.index.dtype == index.dtype + ) if not idx_dtype_match: + # TODO: This should be an early return columns = ( columns if columns is not None else list(df._column_names) ) @@ -2205,7 +2207,7 @@ def _reindex( return self._mimic_inplace(result, inplace=inplace) def reindex( - self, labels=None, axis=0, index=None, columns=None, copy=True + self, labels=None, axis=None, index=None, columns=None, copy=True ): """ Return a new DataFrame whose axes conform to a new index @@ -2254,23 +2256,34 @@ def reindex( if labels is None and index is None and columns is None: return self.copy(deep=copy) - dtypes = dict(self.dtypes) - idx = labels if index is None and axis in (0, "index") else index - cols = ( - labels if columns is None and axis in (1, "columns") else columns - ) + # pandas simply ignores the labels keyword if it is provided in + # addition to index and columns, but it prohibits the axis arg. + if (index is not None or columns is not None) and axis is not None: + raise TypeError( + "Cannot specify both 'axis' and any of 'index' or 'columns'." + ) + + axis = self._get_axis_from_axis_arg(axis) + if axis == 0: + if index is None: + index = labels + else: + if columns is None: + columns = labels df = ( self - if cols is None - else self[list(set(self._column_names) & set(cols))] + if columns is None + else self[list(set(self._column_names) & set(columns))] ) - result = df._reindex( - columns=cols, dtypes=dtypes, deep=copy, index=idx, inplace=False + return df._reindex( + columns=columns, + dtypes=self._dtypes, + deep=copy, + index=index, + inplace=False, ) - return result - def set_index( self, keys, From 60252705eef82ee901f12f42726fb3d3add77fce Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Tue, 26 Oct 2021 15:26:35 -0700 Subject: [PATCH 6/6] Remove TODO. 
--- python/cudf/cudf/core/dataframe.py | 1 - 1 file changed, 1 deletion(-) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 7c0a13f99f7..ec22978be4e 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -2168,7 +2168,6 @@ def _reindex( ) if not idx_dtype_match: - # TODO: This should be an early return columns = ( columns if columns is not None else list(df._column_names) )