Enable numpy ufuncs for DataFrame (#10287)

This PR addresses the primary issue in #9083, enabling all numpy ufuncs for DataFrame objects. It builds on the work in #10217, generalizing that code path to support multiple columns and moving the method up to `IndexedFrame` to share the logic with `DataFrame`. The custom preprocessing of inputs before handing off to cupy that was implemented in #10217 has been replaced by reusing parts of the existing binop machinery for greater generality, which is especially important for DataFrame binops since they support a wider range of alternative operand types. The current internal refactor is intentionally minimal to leave the focus on the new ufunc features. I will make a follow-up to clean up the internal functions by adding a proper set of hooks into the binop and ufunc implementations so that we can share these implementations with Index types as well, at which point we will be able to remove the extraneous APIs discussed in #9083 (comment). Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ashwin Srinath (https://github.com/shwina) URL: #10287
rapidsai · Feb 15, 2022 · 8b0737d · 8b0737d
1 parent 17b7907
commit 8b0737d
Show file tree

Hide file tree

Showing 5 changed files with 292 additions and 155 deletions.
diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py
@@ -1271,14 +1271,6 @@ def memory_usage(self, index=True, deep=False):
             {str(k): v for k, v in super().memory_usage(index, deep).items()}
         )
 
-    @annotate("DATAFRAME_ARRAY_UFUNC", color="blue", domain="cudf_python")
-    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
-        if method == "__call__" and hasattr(cudf, ufunc.__name__):
-            func = getattr(cudf, ufunc.__name__)
-            return func(self)
-        else:
-            return NotImplemented
-
     @annotate("DATAFRAME_ARRAY_FUNCTION", color="blue", domain="cudf_python")
     def __array_function__(self, func, types, args, kwargs):
 
@@ -1864,8 +1856,7 @@ def _get_columns_by_label(self, labels, downcast=False):
         )
         return out
 
-    @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python")
-    def _binaryop(
+    def _prep_for_binop(
         self,
         other: Any,
         fn: str,
@@ -1885,6 +1876,7 @@ def _binaryop(
         # implementation assumes that binary operations between a column and
         # NULL are always commutative, even for binops (like subtraction) that
         # are normally anticommutative.
+        # TODO: We probably should support pandas DataFrame/Series objects.
         if isinstance(rhs, Sequence):
             # TODO: Consider validating sequence length (pandas does).
             operands = {
@@ -1948,11 +1940,30 @@ def _binaryop(
                     right = right_dict[col]
                 operands[col] = (left, right, reflect, fill_value)
         else:
+            return NotImplemented, None
+
+        return operands, lhs._index
+
+    @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python")
+    def _binaryop(
+        self,
+        other: Any,
+        fn: str,
+        fill_value: Any = None,
+        reflect: bool = False,
+        can_reindex: bool = False,
+        *args,
+        **kwargs,
+    ):
+        operands, out_index = self._prep_for_binop(
+            other, fn, fill_value, reflect, can_reindex
+        )
+        if operands is NotImplemented:
             return NotImplemented
 
         return self._from_data(
             ColumnAccessor(type(self)._colwise_binop(operands, fn)),
-            index=lhs._index,
+            index=out_index,
         )
 
     @annotate("DATAFRAME_UPDATE", color="blue", domain="cudf_python")

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
@@ -1697,6 +1697,154 @@ def last(self, offset):
             slice_func=lambda i: self.iloc[i:],
         )
 
+    # For more detail on this function and how it should work, see
+    # https://numpy.org/doc/stable/reference/ufuncs.html
+    def __array_ufunc__(self, ufunc, method, *inputs, **kwargs):
+        # We don't currently support reduction, accumulation, etc. We also
+        # don't support any special kwargs or higher arity ufuncs than binary.
+        if method != "__call__" or kwargs or ufunc.nin > 2:
+            return NotImplemented
+
+        # Binary operations
+        binary_operations = {
+            # Arithmetic binary operations.
+            "add": "add",
+            "subtract": "sub",
+            "multiply": "mul",
+            "matmul": "matmul",
+            "divide": "truediv",
+            "true_divide": "truediv",
+            "floor_divide": "floordiv",
+            "power": "pow",
+            "float_power": "pow",
+            "remainder": "mod",
+            "mod": "mod",
+            "fmod": "mod",
+            # Bitwise binary operations.
+            "bitwise_and": "and",
+            "bitwise_or": "or",
+            "bitwise_xor": "xor",
+            # Comparison binary operators
+            "greater": "gt",
+            "greater_equal": "ge",
+            "less": "lt",
+            "less_equal": "le",
+            "not_equal": "ne",
+            "equal": "eq",
+        }
+
+        # First look for methods of the class.
+        fname = ufunc.__name__
+        if fname in binary_operations:
+            reflect = self is not inputs[0]
+            other = inputs[0] if reflect else inputs[1]
+
+            # These operators need to be mapped to their inverses when
+            # performing a reflected operation because no reflected version of
+            # the operators themselves exist.
+            ops_without_reflection = {
+                "gt": "lt",
+                "ge": "le",
+                "lt": "gt",
+                "le": "ge",
+                # ne and eq are symmetric, so they are their own inverse op
+                "ne": "ne",
+                "eq": "eq",
+            }
+
+            op = binary_operations[fname]
+            if reflect and op in ops_without_reflection:
+                op = ops_without_reflection[op]
+                reflect = False
+            op = f"__{'r' if reflect else ''}{op}__"
+
+            # pandas bitwise operations return bools if indexes are misaligned.
+            if (
+                "bitwise" in fname
+                and isinstance(other, IndexedFrame)
+                and not self.index.equals(other.index)
+            ):
+                return getattr(self, op)(other).astype(bool)
+            # Float_power returns float irrespective of the input type.
+            if fname == "float_power":
+                return getattr(self, op)(other).astype(float)
+            return getattr(self, op)(other)
+
+        # Special handling for unary operations.
+        if fname == "negative":
+            return self * -1
+        if fname == "positive":
+            return self.copy(deep=True)
+        if fname == "invert":
+            return ~self
+        if fname == "absolute":
+            return self.abs()
+        if fname == "fabs":
+            return self.abs().astype(np.float64)
+
+        # Note: There are some operations that may be supported by libcudf but
+        # are not supported by pandas APIs. In particular, libcudf binary
+        # operations support logical and/or operations, but those operations
+        # are not defined on pd.Series/DataFrame. For now those operations will
+        # dispatch to cupy, but if ufuncs are ever a bottleneck we could add
+        # special handling to dispatch those (or any other) functions that we
+        # could implement without cupy.
+
+        # Attempt to dispatch all other functions to cupy.
+        cupy_func = getattr(cp, fname)
+        if cupy_func:
+            # Indices must be aligned before converting to arrays.
+            if ufunc.nin == 2:
+                other = inputs[self is inputs[0]]
+                inputs, index = self._prep_for_binop(other, fname)
+            else:
+                inputs = {
+                    name: (col, None, False, None)
+                    for name, col in self._data.items()
+                }
+                index = self._index
+
+            mask = None
+            data = [{} for _ in range(ufunc.nout)]
+            for name, (left, right, _, _) in inputs.items():
+                cupy_inputs = []
+                # TODO: I'm jumping through multiple hoops to get the unary
+                # behavior to match up with the binary. I should see if there
+                # are better patterns to employ here.
+                for inp in (left, right) if ufunc.nin == 2 else (left,):
+                    if (
+                        isinstance(inp, cudf.core.column.ColumnBase)
+                        and inp.has_nulls()
+                    ):
+                        new_mask = cudf.core.column.as_column(inp.nullmask)
+
+                        # TODO: This is a hackish way to perform a bitwise and
+                        # of bitmasks. Once we expose
+                        # cudf::detail::bitwise_and, then we can use that
+                        # instead.
+                        mask = new_mask if mask is None else (mask & new_mask)
+
+                        # Arbitrarily fill with zeros. For ufuncs, we assume
+                        # that the end result propagates nulls via a bitwise
+                        # and, so these elements are irrelevant.
+                        inp = inp.fillna(0)
+                    cupy_inputs.append(cp.asarray(inp))
+
+                cp_output = cupy_func(*cupy_inputs, **kwargs)
+                if ufunc.nout == 1:
+                    cp_output = (cp_output,)
+                for i, out in enumerate(cp_output):
+                    data[i][name] = cudf.core.column.as_column(out).set_mask(
+                        mask
+                    )
+
+            out = tuple(
+                self.__class__._from_data(out, index=index) for out in data
+            )
+            return out[0] if ufunc.nout == 1 else out
+
+        return NotImplemented
+
 
 def _check_duplicate_level_names(specified, level_names):
     """Raise if any of `specified` has duplicates in `level_names`."""