Skip to content

Commit

Permalink
Enable numpy ufuncs for DataFrame (#10287)
Browse files Browse the repository at this point in the history
This PR addresses the primary issue in #9083, enabling all numpy ufuncs for DataFrame objects. It builds on the work in #10217, generalizing that code path to support multiple columns and moving the method up to `IndexedFrame` to share the logic with `DataFrame`. The custom preprocessing of inputs before handing off to cupy that was implemented in #10217 has been replaced by reusing parts of the existing binop machinery for greater generality, which is especially important for DataFrame binops since they support a wider range of alternative operand types. The current internal refactor is intentionally minimal to leave the focus on the new ufunc features. I will make a follow-up to clean up the internal functions by adding a proper set of hooks into the binop and ufunc implementations so that we can share these implementations with Index types as well, at which point we will be able to remove the extraneous APIs discussed in #9083 (comment).

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Ashwin Srinath (https://github.com/shwina)

URL: #10287
  • Loading branch information
vyasr authored Feb 15, 2022
1 parent 17b7907 commit 8b0737d
Show file tree
Hide file tree
Showing 5 changed files with 292 additions and 155 deletions.
33 changes: 22 additions & 11 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -1271,14 +1271,6 @@ def memory_usage(self, index=True, deep=False):
{str(k): v for k, v in super().memory_usage(index, deep).items()}
)

@annotate("DATAFRAME_ARRAY_UFUNC", color="blue", domain="cudf_python")
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
    """Dispatch a NumPy ufunc call to the cudf function of the same name.

    Only a plain ``__call__`` invocation is handled, and only when the
    ``cudf`` module exposes an attribute named after the ufunc. Every other
    case returns ``NotImplemented``. The dispatched function is called with
    ``self`` alone; ``inputs`` and ``kwargs`` are not forwarded.
    """
    # Guard clause: anything that is not a direct call of a ufunc that cudf
    # mirrors by name is declined.
    if method != "__call__" or not hasattr(cudf, ufunc.__name__):
        return NotImplemented
    return getattr(cudf, ufunc.__name__)(self)

@annotate("DATAFRAME_ARRAY_FUNCTION", color="blue", domain="cudf_python")
def __array_function__(self, func, types, args, kwargs):

Expand Down Expand Up @@ -1864,8 +1856,7 @@ def _get_columns_by_label(self, labels, downcast=False):
)
return out

@annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python")
def _binaryop(
def _prep_for_binop(
self,
other: Any,
fn: str,
Expand All @@ -1885,6 +1876,7 @@ def _binaryop(
# implementation assumes that binary operations between a column and
# NULL are always commutative, even for binops (like subtraction) that
# are normally anticommutative.
# TODO: We probably should support pandas DataFrame/Series objects.
if isinstance(rhs, Sequence):
# TODO: Consider validating sequence length (pandas does).
operands = {
Expand Down Expand Up @@ -1948,11 +1940,30 @@ def _binaryop(
right = right_dict[col]
operands[col] = (left, right, reflect, fill_value)
else:
return NotImplemented, None

return operands, lhs._index

@annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python")
def _binaryop(
    self,
    other: Any,
    fn: str,
    fill_value: Any = None,
    reflect: bool = False,
    can_reindex: bool = False,
    *args,
    **kwargs,
):
    """Perform the binary operation ``fn`` between this frame and ``other``.

    Operand preparation is delegated to ``_prep_for_binop``, which returns
    the per-column operands together with the index of the result. When the
    operands cannot be prepared, ``NotImplemented`` is returned so that
    Python can fall back to the other operand's implementation.

    Parameters
    ----------
    other : Any
        The other operand.
    fn : str
        Name of the binary operation.
    fill_value : Any, optional
        Forwarded to ``_prep_for_binop``.
    reflect : bool, default False
        Forwarded to ``_prep_for_binop``.
    can_reindex : bool, default False
        Forwarded to ``_prep_for_binop``.
    *args, **kwargs
        Accepted for signature compatibility; not used here.
    """
    operands, out_index = self._prep_for_binop(
        other, fn, fill_value, reflect, can_reindex
    )
    if operands is NotImplemented:
        return NotImplemented

    # The result index comes from _prep_for_binop. No ``lhs`` exists in this
    # scope, so the index must not be read from it.
    return self._from_data(
        ColumnAccessor(type(self)._colwise_binop(operands, fn)),
        index=out_index,
    )

@annotate("DATAFRAME_UPDATE", color="blue", domain="cudf_python")
Expand Down
148 changes: 148 additions & 0 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -1697,6 +1697,154 @@ def last(self, offset):
slice_func=lambda i: self.iloc[i:],
)

# For more detail on this function and how it should work, see
# https://numpy.org/doc/stable/reference/ufuncs.html
def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
    """Apply a NumPy ufunc to this frame.

    Binary ufuncs that have an operator equivalent are routed through the
    corresponding dunder method, a handful of unary ufuncs are handled
    directly, and every other ufunc is dispatched column by column to the
    cupy function of the same name.

    Parameters
    ----------
    ufunc : numpy.ufunc
        The ufunc being invoked.
    method : str
        How the ufunc was invoked; only ``"__call__"`` is supported.
    *inputs
        The ufunc operands. One of them is expected to be ``self``.
    **kwargs
        Extra ufunc keyword arguments; none are supported.

    Returns
    -------
    The result of the operation (a tuple of results when
    ``ufunc.nout > 1``), or ``NotImplemented`` when the call cannot be
    handled here.
    """
    # We don't currently support reduction, accumulation, etc. We also
    # don't support any special kwargs or higher arity ufuncs than binary.
    if method != "__call__" or kwargs or ufunc.nin > 2:
        return NotImplemented

    # Binary operations
    binary_operations = {
        # Arithmetic binary operations.
        "add": "add",
        "subtract": "sub",
        "multiply": "mul",
        "matmul": "matmul",
        "divide": "truediv",
        "true_divide": "truediv",
        "floor_divide": "floordiv",
        "power": "pow",
        "float_power": "pow",
        "remainder": "mod",
        "mod": "mod",
        "fmod": "mod",
        # Bitwise binary operations.
        "bitwise_and": "and",
        "bitwise_or": "or",
        "bitwise_xor": "xor",
        # Comparison binary operators
        "greater": "gt",
        "greater_equal": "ge",
        "less": "lt",
        "less_equal": "le",
        "not_equal": "ne",
        "equal": "eq",
    }

    # First look for methods of the class.
    fname = ufunc.__name__
    if fname in binary_operations:
        reflect = self is not inputs[0]
        other = inputs[0] if reflect else inputs[1]

        # These operators need to be mapped to their inverses when
        # performing a reflected operation because no reflected version of
        # the operators themselves exist.
        ops_without_reflection = {
            "gt": "lt",
            "ge": "le",
            "lt": "gt",
            "le": "ge",
            # ne and eq are symmetric, so they are their own inverse op
            "ne": "ne",
            "eq": "eq",
        }

        op = binary_operations[fname]
        if reflect and op in ops_without_reflection:
            op = ops_without_reflection[op]
            reflect = False
        op = f"__{'r' if reflect else ''}{op}__"

        # pandas bitwise operations return bools if indexes are misaligned.
        cast_to_bool = (
            "bitwise" in fname
            and isinstance(other, IndexedFrame)
            and not self.index.equals(other.index)
        )

        result = getattr(self, op)(other)
        # The operator may decline the operand; propagate that instead of
        # attempting to cast the NotImplemented sentinel.
        if result is NotImplemented:
            return NotImplemented
        if cast_to_bool:
            return result.astype(bool)
        # Float_power returns float irrespective of the input type.
        if fname == "float_power":
            return result.astype(float)
        return result

    # Special handling for unary operations.
    if fname == "negative":
        return self * -1
    if fname == "positive":
        return self.copy(deep=True)
    if fname == "invert":
        return ~self
    if fname == "absolute":
        return self.abs()
    if fname == "fabs":
        return self.abs().astype(np.float64)

    # Note: There are some operations that may be supported by libcudf but
    # are not supported by pandas APIs. In particular, libcudf binary
    # operations support logical and/or operations, but those operations
    # are not defined on pd.Series/DataFrame. For now those operations will
    # dispatch to cupy, but if ufuncs are ever a bottleneck we could add
    # special handling to dispatch those (or any other) functions that we
    # could implement without cupy.

    # Attempt to dispatch all other functions to cupy. A ufunc that cupy
    # does not provide is declined rather than raising AttributeError.
    cupy_func = getattr(cp, fname, None)
    if cupy_func is None:
        return NotImplemented

    # Indices must be aligned before converting to arrays.
    if ufunc.nin == 2:
        reflect = self is not inputs[0]
        other = inputs[0] if reflect else inputs[1]
        operands, index = self._prep_for_binop(other, fname)
        # _prep_for_binop signals an unsupported operand with
        # NotImplemented; there is nothing to iterate over in that case.
        if operands is NotImplemented:
            return NotImplemented
    else:
        reflect = False
        operands = {
            name: (col, None, False, None)
            for name, col in self._data.items()
        }
        index = self._index

    data = [{} for _ in range(ufunc.nout)]
    for name, (left, right, _, _) in operands.items():
        if ufunc.nin == 2:
            # Restore the caller's operand order when self was the second
            # input, so non-commutative ufuncs get the right arguments.
            column_inputs = (right, left) if reflect else (left, right)
        else:
            column_inputs = (left,)

        # The mask is tracked per output column so that nulls in one
        # column never leak into another.
        mask = None
        cupy_inputs = []
        for inp in column_inputs:
            if (
                isinstance(inp, cudf.core.column.ColumnBase)
                and inp.has_nulls()
            ):
                new_mask = cudf.core.column.as_column(inp.nullmask)

                # TODO: This is a hackish way to perform a bitwise and
                # of bitmasks. Once we expose
                # cudf::detail::bitwise_and, then we can use that
                # instead.
                mask = new_mask if mask is None else (mask & new_mask)

                # Arbitrarily fill with zeros. For ufuncs, we assume
                # that the end result propagates nulls via a bitwise
                # and, so these elements are irrelevant.
                inp = inp.fillna(0)
            cupy_inputs.append(cp.asarray(inp))

        cp_output = cupy_func(*cupy_inputs, **kwargs)
        if ufunc.nout == 1:
            cp_output = (cp_output,)
        for i, out in enumerate(cp_output):
            data[i][name] = cudf.core.column.as_column(out).set_mask(mask)

    results = tuple(
        self.__class__._from_data(out, index=index) for out in data
    )
    return results[0] if ufunc.nout == 1 else results


def _check_duplicate_level_names(specified, level_names):
"""Raise if any of `specified` has duplicates in `level_names`."""
Expand Down
Loading

0 comments on commit 8b0737d

Please sign in to comment.