Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generalize some more indexed frame methods #9529

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
98 changes: 35 additions & 63 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -917,8 +917,12 @@ def dtypes(self):
string object
dtype: object
"""
return cudf.utils.utils._create_pandas_series(
data=[x.dtype for x in self._data.columns], index=self._data.names,
return pd.Series(self._dtypes)

@property
def _dtypes(self):
    """Return a dict mapping each column name to that column's dtype."""
    names = self._data.names
    columns = self._data.columns
    return {name: column.dtype for name, column in zip(names, columns)}

@property
Expand Down Expand Up @@ -2123,39 +2127,6 @@ def _rename_columns(self, new_names):
mapper = dict(zip(old_cols, new_names))
self.rename(mapper=mapper, inplace=True, axis=1)

@property
def index(self):
    """Returns the index of the DataFrame"""
    return self._index

@index.setter
def index(self, value):
    # The length to compare against comes from the existing index when
    # there is one, otherwise from the frame's row count.
    if self._index is None:
        expected_len = self._num_rows
    else:
        expected_len = len(self._index)
    incoming_len = len(value)

    # The length check is only enforced when the frame has columns.
    if len(self._data) > 0 and incoming_len != expected_len:
        raise ValueError(
            f"Length mismatch: Expected axis has {expected_len} elements, "
            f"new values have {incoming_len} elements"
        )

    # A MultiIndex is stored unchanged; anything else is first converted
    # with as_index.
    if isinstance(value, cudf.core.multiindex.MultiIndex):
        self._index = value
    else:
        self._index = as_index(value)

def _reindex(
self, columns, dtypes=None, deep=False, index=None, inplace=False
):
Expand Down Expand Up @@ -2188,16 +2159,13 @@ def _reindex(
if index is not None:
index = cudf.core.index.as_index(index)

if isinstance(index, cudf.MultiIndex):
idx_dtype_match = all(
left_dtype == right_dtype
for left_dtype, right_dtype in zip(
(col.dtype for col in df.index._data.columns),
(col.dtype for col in index._data.columns),
)
idx_dtype_match = (df.index.nlevels == index.nlevels) and all(
left_dtype == right_dtype
for left_dtype, right_dtype in zip(
(col.dtype for col in df.index._data.columns),
(col.dtype for col in index._data.columns),
)
else:
idx_dtype_match = df.index.dtype == index.dtype
)

if not idx_dtype_match:
columns = (
Expand Down Expand Up @@ -2238,7 +2206,7 @@ def _reindex(
return self._mimic_inplace(result, inplace=inplace)

def reindex(
self, labels=None, axis=0, index=None, columns=None, copy=True
self, labels=None, axis=None, index=None, columns=None, copy=True
):
"""
Return a new DataFrame whose axes conform to a new index
Expand Down Expand Up @@ -2287,23 +2255,34 @@ def reindex(
if labels is None and index is None and columns is None:
return self.copy(deep=copy)

dtypes = dict(self.dtypes)
idx = labels if index is None and axis in (0, "index") else index
cols = (
labels if columns is None and axis in (1, "columns") else columns
)
# pandas simply ignores the labels keyword if it is provided in
# addition to index and columns, but it prohibits the axis arg.
if (index is not None or columns is not None) and axis is not None:
raise TypeError(
"Cannot specify both 'axis' and any of 'index' or 'columns'."
)

axis = self._get_axis_from_axis_arg(axis)
if axis == 0:
if index is None:
index = labels
else:
if columns is None:
columns = labels
df = (
self
if cols is None
else self[list(set(self._column_names) & set(cols))]
if columns is None
else self[list(set(self._column_names) & set(columns))]
)

result = df._reindex(
columns=cols, dtypes=dtypes, deep=copy, index=idx, inplace=False
return df._reindex(
columns=columns,
dtypes=self._dtypes,
deep=copy,
index=index,
inplace=False,
)

return result

def set_index(
self,
keys,
Expand Down Expand Up @@ -6238,13 +6217,6 @@ def corr(self):
df.columns = self.columns
return df

def to_dict(self, orient="dict", into=dict):
    """Always raise TypeError; the message points to `.to_pandas().to_dict()`.

    The ``orient`` and ``into`` arguments are accepted but never used.
    """
    message = (
        "cuDF does not support conversion to host memory "
        "via `to_dict()` method. Consider using "
        "`.to_pandas().to_dict()` to construct a Python dictionary."
    )
    raise TypeError(message)

def to_struct(self, name=None):
"""
Return a struct Series composed of the columns of the DataFrame.
Expand Down
26 changes: 26 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@
import cudf
from cudf.api.types import is_categorical_dtype, is_list_like
from cudf.core.frame import Frame
from cudf.core.index import Index
from cudf.core.multiindex import MultiIndex
from cudf.utils.utils import cached_property

Expand Down Expand Up @@ -98,6 +99,31 @@ class IndexedFrame(Frame):
def __init__(self, data=None, index=None):
    """Forward ``data`` and ``index`` unchanged to the parent initializer."""
    super().__init__(data=data, index=index)

def to_dict(self, *args, **kwargs):
    """Always raise TypeError; the message points to `.to_pandas().to_dict()`.

    Any positional or keyword arguments are accepted and ignored.
    """
    message = (
        "cuDF does not support conversion to host memory "
        "via `to_dict()` method. Consider using "
        "`.to_pandas().to_dict()` to construct a Python dictionary."
    )
    raise TypeError(message)

@property
def index(self):
    """Get the labels for the rows."""
    return self._index

@index.setter
def index(self, value):
    current_len = len(self)
    incoming_len = len(value)

    # A DataFrame with 0 columns can have an index of arbitrary length,
    # so the length check only applies once at least one column exists.
    has_columns = len(self._data) > 0
    if has_columns and incoming_len != current_len:
        raise ValueError(
            f"Length mismatch: Expected axis has {current_len} elements, "
            f"new values have {incoming_len} elements"
        )
    self._index = Index(value)

@cached_property
def loc(self):
"""Select rows and columns by label or boolean mask.
Expand Down
20 changes: 4 additions & 16 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -918,6 +918,10 @@ def set_index(self, index):
e 14
dtype: int64
"""
warnings.warn(
"Series.set_index is deprecated and will be removed in the future",
FutureWarning,
)
index = index if isinstance(index, BaseIndex) else as_index(index)
return self._from_data(self._data, index, self.name)

Expand Down Expand Up @@ -1156,13 +1160,6 @@ def __getitem__(self, arg):

items = SingleColumnFrame.__iter__

def to_dict(self, into=dict):
    """Always raise TypeError; the message points to `.to_pandas().to_dict()`.

    The ``into`` argument is accepted but never used.
    """
    message = (
        "cuDF does not support conversion to host memory "
        "via `to_dict()` method. Consider using "
        "`.to_pandas().to_dict()` to construct a Python dictionary."
    )
    raise TypeError(message)

def __setitem__(self, key, value):
if isinstance(key, slice):
self.iloc[key] = value
Expand Down Expand Up @@ -2118,15 +2115,6 @@ def data(self):
""" # noqa: E501
return self._column.data

@property
def index(self):
    """The index object"""
    return self._index

@index.setter
def index(self, _index):
    # The assigned value is passed through as_index before being stored.
    converted = as_index(_index)
    self._index = converted

@property
def nullmask(self):
"""The gpu buffer for the null-mask"""
Expand Down
6 changes: 6 additions & 0 deletions python/cudf/cudf/tests/test_series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1316,3 +1316,9 @@ def test_series_hash_values(method, validation_data):
validation_results = cudf.Series(validation_data)
hash_values = inputs.hash_values(method=method)
assert_eq(hash_values, validation_results)


def test_set_index_unequal_length():
    """Assigning a three-element index to an empty Series raises ValueError."""
    empty_series = cudf.Series()
    new_labels = [1, 2, 3]
    with pytest.raises(ValueError):
        empty_series.index = new_labels