Skip to content

Commit

Permalink
Consolidate Several Series and Dataframe Methods (#9059)
Browse files Browse the repository at this point in the history
Partly addresses #9038 

This PR consolidates several (trivial) methods from `Series` and `DataFrame` into `Frame`. `__invert__` was consolidated into a shared (more efficient) code path that uses factory methods.
`deserialize` was not consolidated because we have to maintain backward compatibility with older classes, but it now uses a factory method for faster class construction.

Authors:
  - Michael Wang (https://github.com/isVoid)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Sheilah Kirui (https://github.com/skirui-source)

URL: #9059
  • Loading branch information
isVoid authored Aug 30, 2021
1 parent 4945198 commit b2423ac
Show file tree
Hide file tree
Showing 3 changed files with 262 additions and 319 deletions.
123 changes: 5 additions & 118 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -35,7 +35,6 @@
from cudf.core.index import BaseIndex, RangeIndex, as_index
from cudf.core.indexing import _DataFrameIlocIndexer, _DataFrameLocIndexer
from cudf.core.series import Series
from cudf.core.window import Rolling
from cudf.utils import applyutils, docutils, ioutils, queryutils, utils
from cudf.utils.docutils import copy_docstring
from cudf.utils.dtypes import (
Expand Down Expand Up @@ -526,11 +525,12 @@ def serialize(self):

# Use the column directly to avoid duplicating the index
# need to pickle column names to handle numpy integer columns
header["column_names"] = pickle.dumps(tuple(self._data.names))
column_header, column_frames = column.serialize_columns(self._columns)
header["columns"] = column_header
header["columns"], column_frames = column.serialize_columns(
self._columns
)
frames.extend(column_frames)

header["column_names"] = pickle.dumps(tuple(self._data.names))
return header, frames

@classmethod
Expand All @@ -547,7 +547,7 @@ def deserialize(cls, header, frames):
column_names = pickle.loads(header["column_names"])
columns = column.deserialize_columns(header["columns"], column_frames)

return cls(dict(zip(column_names, columns)), index=index)
return cls._from_data(dict(zip(column_names, columns)), index=index,)

@property
def dtypes(self):
Expand Down Expand Up @@ -1029,68 +1029,6 @@ def assign(self, **kwargs):
new[k] = v
return new

def head(self, n=5):
    """
    Return the first ``n`` rows as a new DataFrame.

    Parameters
    ----------
    n : int, default 5
        Stop position of the positional slice taken from the start of
        the frame.

    Returns
    -------
    The result of ``self.iloc[:n]``.

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame()
    >>> df['key'] = [0, 1, 2, 3, 4]
    >>> df['val'] = [float(i + 10) for i in range(5)]  # insert column
    >>> df.head(2)
       key   val
    0    0  10.0
    1    1  11.0
    """
    leading_rows = slice(None, n)
    return self.iloc[leading_rows]

def tail(self, n=5):
    """
    Return the last ``n`` rows as a new DataFrame.

    Parameters
    ----------
    n : int, default 5
        Number of trailing rows to select. ``0`` selects no rows.

    Returns
    -------
    The result of slicing ``self.iloc`` with the trailing-row slice.

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame()
    >>> df['key'] = [0, 1, 2, 3, 4]
    >>> df['val'] = [float(i + 10) for i in range(5)]  # insert column
    >>> df.tail(2)
       key   val
    3    3  13.0
    4    4  14.0
    """
    # ``-0`` is ``0``, so ``iloc[-n:]`` would select every row when n == 0;
    # an explicit empty slice is used for that case instead.
    trailing_rows = slice(0, 0) if n == 0 else slice(-n, None)
    return self.iloc[trailing_rows]

def to_string(self):
    """
    Render the object as a string.

    cuDF uses Pandas internals for efficient string formatting.
    Set formatting options using pandas string formatting options and
    cuDF objects will print identically to Pandas objects.

    cuDF supports `null/None` as a value in any column type, which
    is transparently supported during this output process.

    Returns
    -------
    str
        Whatever ``self.__repr__()`` produces.

    Examples
    --------
    >>> import cudf
    >>> df = cudf.DataFrame()
    >>> df['key'] = [0, 1, 2]
    >>> df['val'] = [float(i + 10) for i in range(3)]
    >>> df.to_string()
    '   key   val\\n0    0  10.0\\n1    1  11.0\\n2    2  12.0'
    """
    rendered = self.__repr__()
    return rendered


def __str__(self):
    """Return the same text as ``to_string()``."""
    text = self.to_string()
    return text

def astype(self, dtype, copy=False, errors="raise", **kwargs):
"""
Cast the DataFrame to the given dtype
Expand Down Expand Up @@ -1644,14 +1582,6 @@ def update(

self._mimic_inplace(source_df, inplace=True)

def __invert__(self):
    """Return a new DataFrame whose columns are ``~self[col]``."""
    # Defer logic to Series since pandas semantics dictate different
    # behaviors for different types that requires too much special casing
    # of the standard _unaryop.
    inverted_columns = {}
    for name in self:
        inverted_columns[name] = ~self[name]
    # The result is built on the same index as the source frame.
    return DataFrame(data=inverted_columns, index=self.index)

def radd(self, other, axis=1, level=None, fill_value=None):
"""
Get Addition of dataframe and other, element-wise (binary
Expand Down Expand Up @@ -3505,15 +3435,6 @@ def rename(
else:
return out.copy(deep=copy)

def nans_to_nulls(self):
    """
    Convert nans (if any) to nulls.

    Returns
    -------
    A copy of this object in which every column has been replaced by
    the result of calling ``nans_to_nulls()`` on that column. The
    object the method is called on is not reassigned.
    """
    result = self.copy()
    for name in result.columns:
        column = result[name]
        result[name] = column.nans_to_nulls()
    return result

def as_gpu_matrix(self, columns=None, order="F"):
"""Convert to a matrix in device memory.
Expand Down Expand Up @@ -4506,19 +4427,6 @@ def groupby(
sort=sort,
)

@copy_docstring(Rolling)
def rolling(
    self, window, min_periods=None, center=False, axis=0, win_type=None
):
    # No docstring is written here: presumably ``copy_docstring(Rolling)``
    # supplies it from ``Rolling`` -- confirm against cudf.utils.docutils.
    # Every option is forwarded to ``Rolling`` by keyword, unchanged.
    options = {
        "min_periods": min_periods,
        "center": center,
        "axis": axis,
        "win_type": win_type,
    }
    return Rolling(self, window, **options)

def query(self, expr, local_dict=None):
"""
Query with a boolean expression using Numba to compile a GPU kernel.
Expand Down Expand Up @@ -6732,27 +6640,6 @@ def to_feather(self, path, *args, **kwargs):

feather.to_feather(self, path, *args, **kwargs)

@ioutils.doc_to_json()
def to_json(self, path_or_buf=None, *args, **kwargs):
    """{docstring}"""
    # The docstring above is a template placeholder and must stay as-is;
    # presumably ``ioutils.doc_to_json`` fills it in -- confirm in ioutils.
    from cudf.io import json as json

    # Forward ``path_or_buf`` positionally. The previous call passed it by
    # keyword *before* ``*args``; Python binds unpacked positionals first,
    # so any extra positional argument landed in the ``path_or_buf`` slot
    # and the call failed with "got multiple values for argument".
    # NOTE(review): assumes ``path_or_buf`` is the second positional
    # parameter of ``cudf.io.json.to_json`` -- confirm against that module.
    return json.to_json(self, path_or_buf, *args, **kwargs)

@ioutils.doc_to_hdf()
def to_hdf(self, path_or_buf, key, *args, **kwargs):
    """{docstring}"""
    # The docstring above is a template placeholder and must stay as-is;
    # presumably ``ioutils.doc_to_hdf`` fills it in -- confirm in ioutils.
    from cudf.io import hdf as hdf_io

    # The writer takes the destination and key first, then the object
    # being written. Nothing is returned to the caller.
    hdf_io.to_hdf(path_or_buf, key, self, *args, **kwargs)

@ioutils.doc_to_dlpack()
def to_dlpack(self):
    """{docstring}"""
    # The docstring above is a template placeholder and must stay as-is;
    # presumably ``ioutils.doc_to_dlpack`` fills it in -- confirm in ioutils.
    from cudf.io import dlpack as dlpack_io

    # Hand back whatever the module-level converter produces.
    converted = dlpack_io.to_dlpack(self)
    return converted

@ioutils.doc_dataframe_to_csv()
def to_csv(
self,
Expand Down
Loading

0 comments on commit b2423ac

Please sign in to comment.