Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Enable numpy ufuncs for DataFrame #10287

Merged
merged 10 commits into from
Feb 15, 2022
33 changes: 22 additions & 11 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1271,14 +1271,6 @@ def memory_usage(self, index=True, deep=False):
{str(k): v for k, v in super().memory_usage(index, deep).items()}
)

@annotate("DATAFRAME_ARRAY_UFUNC", color="blue", domain="cudf_python")
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
    """Route a NumPy ufunc call to the same-named function on ``cudf``.

    Only plain calls (``method == "__call__"``) are handled, and only when
    the ``cudf`` module exposes an attribute with the ufunc's name. That
    attribute is invoked with ``self`` as its sole argument; ``inputs``
    and ``kwargs`` are not forwarded. In every other case
    ``NotImplemented`` is returned.
    """
    if method != "__call__":
        return NotImplemented

    ufunc_name = ufunc.__name__
    if not hasattr(cudf, ufunc_name):
        return NotImplemented

    return getattr(cudf, ufunc_name)(self)

@annotate("DATAFRAME_ARRAY_FUNCTION", color="blue", domain="cudf_python")
def __array_function__(self, func, types, args, kwargs):

Expand Down Expand Up @@ -1864,8 +1856,7 @@ def _get_columns_by_label(self, labels, downcast=False):
)
return out

@annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python")
def _binaryop(
def _prep_for_binop(
shwina marked this conversation as resolved.
Show resolved Hide resolved
self,
other: Any,
fn: str,
Expand All @@ -1885,6 +1876,7 @@ def _binaryop(
# implementation assumes that binary operations between a column and
# NULL are always commutative, even for binops (like subtraction) that
# are normally anticommutative.
# TODO: We probably should support pandas DataFrame/Series objects.
if isinstance(rhs, Sequence):
# TODO: Consider validating sequence length (pandas does).
operands = {
Expand Down Expand Up @@ -1948,11 +1940,30 @@ def _binaryop(
right = right_dict[col]
operands[col] = (left, right, reflect, fill_value)
else:
return NotImplemented, None

return operands, lhs._index

@annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python")
def _binaryop(
    self,
    other: Any,
    fn: str,
    fill_value: Any = None,
    reflect: bool = False,
    can_reindex: bool = False,
    *args,
    **kwargs,
):
    """Apply the binary operation ``fn`` between this frame and ``other``.

    The per-column operands and the index of the result are computed by
    ``_prep_for_binop``; the column-wise results are then assembled into a
    new object of the same type via ``_from_data``.

    Parameters
    ----------
    other : Any
        The other operand, forwarded to ``_prep_for_binop``.
    fn : str
        Name of the binary operation, forwarded to ``_prep_for_binop`` and
        ``_colwise_binop``.
    fill_value : Any, default None
        Forwarded to ``_prep_for_binop``.
    reflect : bool, default False
        Forwarded to ``_prep_for_binop``.
    can_reindex : bool, default False
        Forwarded to ``_prep_for_binop``.
    *args, **kwargs
        Accepted for signature compatibility; not used by this method.

    Returns
    -------
    The result of ``self._from_data(...)``, or ``NotImplemented`` when
    ``_prep_for_binop`` reports that ``other`` is not supported.
    """
    operands, out_index = self._prep_for_binop(
        other, fn, fill_value, reflect, can_reindex
    )
    if operands is NotImplemented:
        return NotImplemented

    # The index must come from ``_prep_for_binop``: the aligned left-hand
    # side it builds is local to that helper and not visible here.
    return self._from_data(
        ColumnAccessor(type(self)._colwise_binop(operands, fn)),
        index=out_index,
    )

@annotate("DATAFRAME_UPDATE", color="blue", domain="cudf_python")
Expand Down
148 changes: 148 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1697,6 +1697,154 @@ def last(self, offset):
slice_func=lambda i: self.iloc[i:],
)

# For more detail on this function and how it should work, see
# https://numpy.org/doc/stable/reference/ufuncs.html
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
    """Handle a NumPy ufunc applied to this object.

    Dispatch happens in three stages:

    1. Binary ufuncs listed in ``binary_operations`` are mapped onto the
       corresponding (possibly reflected) dunder method of ``self``.
    2. A handful of unary ufuncs (``negative``, ``positive``, ``invert``,
       ``absolute``, ``fabs``) are mapped onto operations on ``self``.
    3. Anything else is looked up by name on cupy and applied column by
       column, with null masks of the inputs combined via a bitwise and.

    Parameters
    ----------
    ufunc : numpy.ufunc
        The ufunc being invoked.
    method : str
        The ufunc method; only ``"__call__"`` is supported.
    *inputs
        The positional operands of the ufunc call (``self`` is one of
        them).
    **kwargs
        Keyword arguments of the ufunc call; any non-empty ``kwargs``
        makes this method decline the call.

    Returns
    -------
    The result of the dispatched operation; a tuple of results when the
    cupy path is taken and ``ufunc.nout > 1``; or ``NotImplemented`` when
    the call is unsupported.
    """
    # We don't currently support reduction, accumulation, etc. We also
    # don't support any special kwargs or higher arity ufuncs than binary.
    if method != "__call__" or kwargs or ufunc.nin > 2:
        return NotImplemented

    # Binary operations
    binary_operations = {
        # Arithmetic binary operations.
        "add": "add",
        "subtract": "sub",
        "multiply": "mul",
        "matmul": "matmul",
        "divide": "truediv",
        "true_divide": "truediv",
        "floor_divide": "floordiv",
        "power": "pow",
        "float_power": "pow",
        "remainder": "mod",
        "mod": "mod",
        "fmod": "mod",
        # Bitwise binary operations.
        "bitwise_and": "and",
        "bitwise_or": "or",
        "bitwise_xor": "xor",
        # Comparison binary operators
        "greater": "gt",
        "greater_equal": "ge",
        "less": "lt",
        "less_equal": "le",
        "not_equal": "ne",
        "equal": "eq",
    }

    # First look for methods of the class.
    fname = ufunc.__name__
    if fname in binary_operations:
        reflect = self is not inputs[0]
        other = inputs[0] if reflect else inputs[1]

        # These operators need to be mapped to their inverses when
        # performing a reflected operation because no reflected version of
        # the operators themselves exist.
        ops_without_reflection = {
            "gt": "lt",
            "ge": "le",
            "lt": "gt",
            "le": "ge",
            # ne and eq are symmetric, so they are their own inverse op
            "ne": "ne",
            "eq": "eq",
        }

        op = binary_operations[fname]
        if reflect and op in ops_without_reflection:
            op = ops_without_reflection[op]
            reflect = False
        op = f"__{'r' if reflect else ''}{op}__"

        # pandas bitwise operations return bools if indexes are misaligned.
        if (
            "bitwise" in fname
            and isinstance(other, IndexedFrame)
            and not self.index.equals(other.index)
        ):
            return getattr(self, op)(other).astype(bool)
        # Float_power returns float irrespective of the input type.
        if fname == "float_power":
            return getattr(self, op)(other).astype(float)
        return getattr(self, op)(other)

    # Special handling for unary operations.
    if fname == "negative":
        return self * -1
    if fname == "positive":
        return self.copy(deep=True)
    if fname == "invert":
        return ~self
    if fname == "absolute":
        return self.abs()
    if fname == "fabs":
        return self.abs().astype(np.float64)

    # Note: There are some operations that may be supported by libcudf but
    # are not supported by pandas APIs. In particular, libcudf binary
    # operations support logical and/or operations, but those operations
    # are not defined on pd.Series/DataFrame. For now those operations will
    # dispatch to cupy, but if ufuncs are ever a bottleneck we could add
    # special handling to dispatch those (or any other) functions that we
    # could implement without cupy.

    # Attempt to dispatch all other functions to cupy. A default is
    # supplied so that a ufunc cupy does not provide makes us decline the
    # call instead of raising AttributeError.
    cupy_func = getattr(cp, fname, None)
    if not cupy_func:
        return NotImplemented

    # Indices must be aligned before converting to arrays.
    if ufunc.nin == 2:
        other = inputs[1] if self is inputs[0] else inputs[0]
        operands, index = self._prep_for_binop(other, fname)
        # _prep_for_binop signals an unsupported operand type this way.
        if operands is NotImplemented:
            return NotImplemented
    else:
        # Mirror the (left, right, reflect, fill_value) layout used for
        # binary operands so both arities share the loop below.
        operands = {
            name: (col, None, False, None)
            for name, col in self._data.items()
        }
        index = self._index

    data = [{} for _ in range(ufunc.nout)]
    for name, (left, right, _, _) in operands.items():
        # The mask is per output column: it must start fresh for every
        # column so that nulls of one column do not leak into the next.
        mask = None
        cupy_inputs = []
        for inp in (left, right) if ufunc.nin == 2 else (left,):
            if (
                isinstance(inp, cudf.core.column.ColumnBase)
                and inp.has_nulls()
            ):
                new_mask = cudf.core.column.as_column(inp.nullmask)

                # TODO: This is a hackish way to perform a bitwise and
                # of bitmasks. Once we expose
                # cudf::detail::bitwise_and, then we can use that
                # instead.
                mask = new_mask if mask is None else (mask & new_mask)

                # Arbitrarily fill with zeros. For ufuncs, we assume
                # that the end result propagates nulls via a bitwise
                # and, so these elements are irrelevant.
                inp = inp.fillna(0)
            cupy_inputs.append(cp.asarray(inp))

        # kwargs is necessarily empty here because of the guard at the
        # top; it is forwarded so relaxing that guard needs no change here.
        cp_output = cupy_func(*cupy_inputs, **kwargs)
        if ufunc.nout == 1:
            cp_output = (cp_output,)
        for i, out in enumerate(cp_output):
            data[i][name] = cudf.core.column.as_column(out).set_mask(mask)

    results = tuple(
        self.__class__._from_data(cols, index=index) for cols in data
    )
    return results[0] if ufunc.nout == 1 else results


def _check_duplicate_level_names(specified, level_names):
"""Raise if any of `specified` has duplicates in `level_names`."""
Expand Down
Loading