diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 721ebf22de7..eda9d6c992d 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -35,7 +35,6 @@ from cudf.core.index import BaseIndex, RangeIndex, as_index from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer from cudf.core.series import Series -from cudf.core.window import Rolling from cudf.utils import applyutils, docutils, ioutils, queryutils, utils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( @@ -526,11 +525,12 @@ def serialize(self): # Use the column directly to avoid duplicating the index # need to pickle column names to handle numpy integer columns - header["column_names"] = pickle.dumps(tuple(self._data.names)) - column_header, column_frames = column.serialize_columns(self._columns) - header["columns"] = column_header + header["columns"], column_frames = column.serialize_columns( + self._columns + ) frames.extend(column_frames) + header["column_names"] = pickle.dumps(tuple(self._data.names)) return header, frames @classmethod @@ -547,7 +547,7 @@ def deserialize(cls, header, frames): column_names = pickle.loads(header["column_names"]) columns = column.deserialize_columns(header["columns"], column_frames) - return cls(dict(zip(column_names, columns)), index=index) + return cls._from_data(dict(zip(column_names, columns)), index=index,) @property def dtypes(self): @@ -1029,68 +1029,6 @@ def assign(self, **kwargs): new[k] = v return new - def head(self, n=5): - """ - Returns the first n rows as a new DataFrame - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2, 3, 4] - >>> df['val'] = [float(i + 10) for i in range(5)] # insert column - >>> df.head(2) - key val - 0 0 10.0 - 1 1 11.0 - """ - return self.iloc[:n] - - def tail(self, n=5): - """ - Returns the last n rows as a new DataFrame - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2, 3, 4] - >>> df['val'] = [float(i + 10) for i in range(5)] # insert column - >>> df.tail(2) - key val - 3 3 13.0 - 4 4 14.0 - """ - if n == 0: - return self.iloc[0:0] - - return self.iloc[-n:] - - def to_string(self): - """ - Convert to string - - cuDF uses Pandas internals for efficient string formatting. - Set formatting options using pandas string formatting options and - cuDF objects will print identically to Pandas objects. - - cuDF supports `null/None` as a value in any column type, which - is transparently supported during this output process. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame() - >>> df['key'] = [0, 1, 2] - >>> df['val'] = [float(i + 10) for i in range(3)] - >>> df.to_string() - ' key val\\n0 0 10.0\\n1 1 11.0\\n2 2 12.0' - """ - return self.__repr__() - - def __str__(self): - return self.to_string() - def astype(self, dtype, copy=False, errors="raise", **kwargs): """ Cast the DataFrame to the given dtype @@ -1644,14 +1582,6 @@ def update( self._mimic_inplace(source_df, inplace=True) - def __invert__(self): - # Defer logic to Series since pandas semantics dictate different - # behaviors for different types that requires too much special casing - # of the standard _unaryop. - return DataFrame( - data={col: ~self[col] for col in self}, index=self.index - ) - def radd(self, other, axis=1, level=None, fill_value=None): """ Get Addition of dataframe and other, element-wise (binary @@ -3505,15 +3435,6 @@ def rename( else: return out.copy(deep=copy) - def nans_to_nulls(self): - """ - Convert nans (if any) to nulls. - """ - df = self.copy() - for col in df.columns: - df[col] = df[col].nans_to_nulls() - return df - def as_gpu_matrix(self, columns=None, order="F"): """Convert to a matrix in device memory. @@ -4506,19 +4427,6 @@ def groupby( sort=sort, ) - @copy_docstring(Rolling) - def rolling( - self, window, min_periods=None, center=False, axis=0, win_type=None - ): - return Rolling( - self, - window, - min_periods=min_periods, - center=center, - axis=axis, - win_type=win_type, - ) - def query(self, expr, local_dict=None): """ Query with a boolean expression using Numba to compile a GPU kernel. @@ -6732,27 +6640,6 @@ def to_feather(self, path, *args, **kwargs): feather.to_feather(self, path, *args, **kwargs) - @ioutils.doc_to_json() - def to_json(self, path_or_buf=None, *args, **kwargs): - """{docstring}""" - from cudf.io import json as json - - return json.to_json(self, path_or_buf=path_or_buf, *args, **kwargs) - - @ioutils.doc_to_hdf() - def to_hdf(self, path_or_buf, key, *args, **kwargs): - """{docstring}""" - from cudf.io import hdf as hdf - - hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) - - @ioutils.doc_to_dlpack() - def to_dlpack(self): - """{docstring}""" - from cudf.io import dlpack as dlpack - - return dlpack.to_dlpack(self) - @ioutils.doc_dataframe_to_csv() def to_csv( self, diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 9f743cd8c85..8b3677212da 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -27,6 +27,8 @@ ) from cudf.core.column_accessor import ColumnAccessor from cudf.core.join import merge +from cudf.core.window import Rolling +from cudf.utils import ioutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( _is_non_decimal_numeric_dtype, @@ -4531,6 +4533,242 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): "prod", axis=axis, skipna=skipna, cast_to_int=True, *args, **kwargs ) + @ioutils.doc_to_json() + def to_json(self, path_or_buf=None, *args, **kwargs): + """{docstring}""" + + return cudf.io.json.to_json( + self, path_or_buf=path_or_buf, *args, **kwargs + ) + + @ioutils.doc_to_hdf() + def to_hdf(self, path_or_buf, key, *args, **kwargs): + """{docstring}""" + + cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) + + @ioutils.doc_to_dlpack() + def to_dlpack(self): + """{docstring}""" + + return cudf.io.dlpack.to_dlpack(self) + + def to_string(self): + """ + Convert to string + + cuDF uses Pandas internals for efficient string formatting. + Set formatting options using pandas string formatting options and + cuDF objects will print identically to Pandas objects. + + cuDF supports `null/None` as a value in any column type, which + is transparently supported during this output process. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame() + >>> df['key'] = [0, 1, 2] + >>> df['val'] = [float(i + 10) for i in range(3)] + >>> df.to_string() + ' key val\\n0 0 10.0\\n1 1 11.0\\n2 2 12.0' + """ + return self.__repr__() + + def __str__(self): + return self.to_string() + + def head(self, n=5): + """ + Return the first `n` rows. + This function returns the first `n` rows for the object based + on position. It is useful for quickly testing if your object + has the right type of data in it. + For negative values of `n`, this function returns all rows except + the last `n` rows, equivalent to ``df[:-n]``. + + Parameters + ---------- + n : int, default 5 + Number of rows to select. + + Returns + ------- + DataFrame or Series + The first `n` rows of the caller object. + + See Also + -------- + Frame.tail: Returns the last `n` rows. + + Examples + -------- + + **Series** + + >>> ser = cudf.Series(['alligator', 'bee', 'falcon', + ... 'lion', 'monkey', 'parrot', 'shark', 'whale', 'zebra']) + >>> ser + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + 6 shark + 7 whale + 8 zebra + dtype: object + + Viewing the first 5 lines + + >>> ser.head() + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + dtype: object + + Viewing the first `n` lines (three in this case) + + >>> ser.head(3) + 0 alligator + 1 bee + 2 falcon + dtype: object + + For negative values of `n` + + >>> ser.head(-3) + 0 alligator + 1 bee + 2 falcon + 3 lion + 4 monkey + 5 parrot + dtype: object + + **DataFrame** + + >>> df = cudf.DataFrame() + >>> df['key'] = [0, 1, 2, 3, 4] + >>> df['val'] = [float(i + 10) for i in range(5)] # insert column + >>> df.head(2) + key val + 0 0 10.0 + 1 1 11.0 + """ + return self.iloc[:n] + + def tail(self, n=5): + """ + Returns the last n rows as a new DataFrame or Series + + Examples + -------- + + **DataFrame** + + >>> import cudf + >>> df = cudf.DataFrame() + >>> df['key'] = [0, 1, 2, 3, 4] + >>> df['val'] = [float(i + 10) for i in range(5)] # insert column + >>> df.tail(2) + key val + 3 3 13.0 + 4 4 14.0 + + **Series** + + >>> import cudf + >>> ser = cudf.Series([4, 3, 2, 1, 0]) + >>> ser.tail(2) + 3 1 + 4 0 + """ + if n == 0: + return self.iloc[0:0] + + return self.iloc[-n:] + + @copy_docstring(Rolling) + def rolling( + self, window, min_periods=None, center=False, axis=0, win_type=None + ): + return Rolling( + self, + window, + min_periods=min_periods, + center=center, + axis=axis, + win_type=win_type, + ) + + def nans_to_nulls(self): + """ + Convert nans (if any) to nulls + + Returns + ------- + DataFrame or Series + + Examples + -------- + + **Series** + + >>> import cudf, numpy as np + >>> series = cudf.Series([1, 2, np.nan, None, 10], nan_as_null=False) + >>> series + 0 1.0 + 1 2.0 + 2 NaN + 3 + 4 10.0 + dtype: float64 + >>> series.nans_to_nulls() + 0 1.0 + 1 2.0 + 2 + 3 + 4 10.0 + dtype: float64 + + **DataFrame** + + >>> df = cudf.DataFrame() + >>> df['a'] = cudf.Series([1, None, np.nan], nan_as_null=False) + >>> df['b'] = cudf.Series([None, 3.14, np.nan], nan_as_null=False) + >>> df + a b + 0 1.0 + 1 3.14 + 2 NaN NaN + >>> df.nans_to_nulls() + a b + 0 1.0 + 1 3.14 + 2 + """ + return self._from_data( + { + name: col.copy().nans_to_nulls() + for name, col in self._data.items() + }, + self._index, + ) + + def __invert__(self): + """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" + return self._from_data( + { + name: _apply_inverse_column(col) + for name, col in self._data.items() + }, + self._index, + ) + class SingleColumnFrame(Frame): """A one-dimensional frame. @@ -5259,3 +5497,15 @@ def _drop_rows_by_labels( return res else: return obj.join(key_df, how="leftanti") + + +def _apply_inverse_column(col: ColumnBase) -> ColumnBase: + """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" + if np.issubdtype(col.dtype, np.integer): + return col.unary_operator("invert") + elif np.issubdtype(col.dtype, np.bool_): + return col.unary_operator("not") + else: + raise TypeError( + f"Operation `~` not supported on {col.dtype.type.__name__}" + ) diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index ff3b9fc68ef..7943d033cf8 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -41,8 +41,7 @@ from cudf.core.groupby.groupby import SeriesGroupBy from cudf.core.index import BaseIndex, Index, RangeIndex, as_index from cudf.core.indexing import _SeriesIlocIndexer, _SeriesLocIndexer -from cudf.core.window import Rolling -from cudf.utils import cudautils, docutils, ioutils +from cudf.utils import cudautils, docutils from cudf.utils.docutils import copy_docstring from cudf.utils.dtypes import ( can_convert_to_column, @@ -329,15 +328,16 @@ def from_pandas(cls, s, nan_as_null=None): def serialize(self): header = {} frames = [] + header["type-serialized"] = pickle.dumps(type(self)) header["index"], index_frames = self._index.serialize() - header["name"] = pickle.dumps(self.name) - frames.extend(index_frames) header["index_frame_count"] = len(index_frames) + frames.extend(index_frames) + header["column"], column_frames = self._column.serialize() - header["type-serialized"] = pickle.dumps(type(self)) - frames.extend(column_frames) header["column_frame_count"] = len(column_frames) + frames.extend(column_frames) + header["name"] = pickle.dumps(self.name) return header, frames @property @@ -381,7 +381,7 @@ def deserialize(cls, header, frames): col_typ = pickle.loads(header["column"]["type-serialized"]) column = col_typ.deserialize(header["column"], frames[:column_nframes]) - return Series(column, index=index, name=name) + return cls._from_data({name: column}, index=index) def _get_columns_by_label(self, labels, downcast=False): """Return the column specified by `labels` @@ -1094,124 +1094,6 @@ def take(self, indices, keep_index=True): {self.name: self._column.take(col_inds, keep_index=False)} ) - def head(self, n=5): - """ - Return the first `n` rows. - This function returns the first `n` rows for the object based - on position. It is useful for quickly testing if your object - has the right type of data in it. - For negative values of `n`, this function returns all rows except - the last `n` rows, equivalent to ``df[:-n]``. - - Parameters - ---------- - n : int, default 5 - Number of rows to select. - - Returns - ------- - same type as caller - The first `n` rows of the caller object. - - See Also - -------- - Series.tail: Returns the last `n` rows. - - Examples - -------- - >>> ser = cudf.Series(['alligator', 'bee', 'falcon', - ... 'lion', 'monkey', 'parrot', 'shark', 'whale', 'zebra']) - >>> ser - 0 alligator - 1 bee - 2 falcon - 3 lion - 4 monkey - 5 parrot - 6 shark - 7 whale - 8 zebra - dtype: object - - Viewing the first 5 lines - - >>> ser.head() - 0 alligator - 1 bee - 2 falcon - 3 lion - 4 monkey - dtype: object - - Viewing the first `n` lines (three in this case) - - >>> ser.head(3) - 0 alligator - 1 bee - 2 falcon - dtype: object - - For negative values of `n` - - >>> ser.head(-3) - 0 alligator - 1 bee - 2 falcon - 3 lion - 4 monkey - 5 parrot - dtype: object - """ - return self.iloc[:n] - - def tail(self, n=5): - """ - Returns the last n rows as a new Series - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([4, 3, 2, 1, 0]) - >>> ser.tail(2) - 3 1 - 4 0 - """ - if n == 0: - return self.iloc[0:0] - - return self.iloc[-n:] - - def to_string(self): - """Convert to string - - Uses Pandas formatting internals to produce output identical to Pandas. - Use the Pandas formatting settings directly in Pandas to control cuDF - output. - - Returns - ------- - str - String representation of Series - - Examples - -------- - >>> import cudf - >>> series = cudf.Series(['a', None, 'b', 'c', None]) - >>> series - 0 a - 1 - 2 b - 3 c - 4 - dtype: object - >>> series.to_string() - '0 a\\n1 \\n2 b\\n3 c\\n4 \\ndtype: object' - """ # noqa : E501 - return self.__repr__() - - def __str__(self): - return self.to_string() - def __repr__(self): _, height = get_terminal_size() max_rows = ( @@ -2332,17 +2214,6 @@ def ge(self, other, fill_value=None, axis=0): other=other, fn="ge", fill_value=fill_value, can_reindex=True ) - def __invert__(self): - """Bitwise invert (~) for integral dtypes, logical NOT for bools.""" - if np.issubdtype(self.dtype, np.integer): - return self._unaryop("invert") - elif np.issubdtype(self.dtype, np.bool_): - return self._unaryop("not") - else: - raise TypeError( - f"Operation `~` not supported on {self.dtype.type.__name__}" - ) - @copy_docstring(CategoricalAccessor) # type: ignore @property def cat(self): @@ -2693,38 +2564,6 @@ def to_array(self, fillna=None): """ return self._column.to_array(fillna=fillna) - def nans_to_nulls(self): - """ - Convert nans (if any) to nulls - - Returns - ------- - Series - - Examples - -------- - >>> import cudf - >>> import numpy as np - >>> series = cudf.Series([1, 2, np.nan, None, 10], nan_as_null=False) - >>> series - 0 1.0 - 1 2.0 - 2 NaN - 3 - 4 10.0 - dtype: float64 - >>> series.nans_to_nulls() - 0 1.0 - 1 2.0 - 2 - 3 - 4 10.0 - dtype: float64 - """ - return self._from_data( - {self.name: self._column.nans_to_nulls()}, self._index - ) - def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): if bool_only not in (None, True): raise NotImplementedError( @@ -4931,39 +4770,6 @@ def groupby( self, by=by, level=level, dropna=dropna, sort=sort ) - @copy_docstring(Rolling) - def rolling( - self, window, min_periods=None, center=False, axis=0, win_type=None - ): - return Rolling( - self, - window, - min_periods=min_periods, - center=center, - axis=axis, - win_type=win_type, - ) - - @ioutils.doc_to_json() - def to_json(self, path_or_buf=None, *args, **kwargs): - """{docstring}""" - - return cudf.io.json.to_json( - self, path_or_buf=path_or_buf, *args, **kwargs - ) - - @ioutils.doc_to_hdf() - def to_hdf(self, path_or_buf, key, *args, **kwargs): - """{docstring}""" - - cudf.io.hdf.to_hdf(path_or_buf, key, self, *args, **kwargs) - - @ioutils.doc_to_dlpack() - def to_dlpack(self): - """{docstring}""" - - return cudf.io.dlpack.to_dlpack(self) - def rename(self, index=None, copy=True): """ Alter Series name