diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index 371404ca477..aab3b989047 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -1271,14 +1271,6 @@ def memory_usage(self, index=True, deep=False): {str(k): v for k, v in super().memory_usage(index, deep).items()} ) - @annotate("DATAFRAME_ARRAY_UFUNC", color="blue", domain="cudf_python") - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - if method == "__call__" and hasattr(cudf, ufunc.__name__): - func = getattr(cudf, ufunc.__name__) - return func(self) - else: - return NotImplemented - @annotate("DATAFRAME_ARRAY_FUNCTION", color="blue", domain="cudf_python") def __array_function__(self, func, types, args, kwargs): @@ -1864,8 +1856,7 @@ def _get_columns_by_label(self, labels, downcast=False): ) return out - @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") - def _binaryop( + def _prep_for_binop( self, other: Any, fn: str, @@ -1885,6 +1876,7 @@ def _binaryop( # implementation assumes that binary operations between a column and # NULL are always commutative, even for binops (like subtraction) that # are normally anticommutative. + # TODO: We probably should support pandas DataFrame/Series objects. if isinstance(rhs, Sequence): # TODO: Consider validating sequence length (pandas does). operands = { @@ -1948,11 +1940,30 @@ def _binaryop( right = right_dict[col] operands[col] = (left, right, reflect, fill_value) else: + return NotImplemented, None + + return operands, lhs._index + + @annotate("DATAFRAME_BINARYOP", color="blue", domain="cudf_python") + def _binaryop( + self, + other: Any, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ): + operands, out_index = self._prep_for_binop( + other, fn, fill_value, reflect, can_reindex + ) + if operands is NotImplemented: return NotImplemented return self._from_data( ColumnAccessor(type(self)._colwise_binop(operands, fn)), - index=lhs._index, + index=out_index, ) @annotate("DATAFRAME_UPDATE", color="blue", domain="cudf_python") diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 7788a5346c8..e1ff3984948 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -1697,6 +1697,154 @@ def last(self, offset): slice_func=lambda i: self.iloc[i:], ) + # For more detail on this function and how it should work, see + # https://numpy.org/doc/stable/reference/ufuncs.html + def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): + # We don't currently support reduction, accumulation, etc. We also + # don't support any special kwargs or higher arity ufuncs than binary. + if method != "__call__" or kwargs or ufunc.nin > 2: + return NotImplemented + + # Binary operations + binary_operations = { + # Arithmetic binary operations. + "add": "add", + "subtract": "sub", + "multiply": "mul", + "matmul": "matmul", + "divide": "truediv", + "true_divide": "truediv", + "floor_divide": "floordiv", + "power": "pow", + "float_power": "pow", + "remainder": "mod", + "mod": "mod", + "fmod": "mod", + # Bitwise binary operations. + "bitwise_and": "and", + "bitwise_or": "or", + "bitwise_xor": "xor", + # Comparison binary operators + "greater": "gt", + "greater_equal": "ge", + "less": "lt", + "less_equal": "le", + "not_equal": "ne", + "equal": "eq", + } + + # First look for methods of the class. + fname = ufunc.__name__ + if fname in binary_operations: + reflect = self is not inputs[0] + other = inputs[0] if reflect else inputs[1] + + # These operators need to be mapped to their inverses when + # performing a reflected operation because no reflected version of + # the operators themselves exist. + ops_without_reflection = { + "gt": "lt", + "ge": "le", + "lt": "gt", + "le": "ge", + # ne and eq are symmetric, so they are their own inverse op + "ne": "ne", + "eq": "eq", + } + + op = binary_operations[fname] + if reflect and op in ops_without_reflection: + op = ops_without_reflection[op] + reflect = False + op = f"__{'r' if reflect else ''}{op}__" + + # pandas bitwise operations return bools if indexes are misaligned. + if ( + "bitwise" in fname + and isinstance(other, IndexedFrame) + and not self.index.equals(other.index) + ): + return getattr(self, op)(other).astype(bool) + # Float_power returns float irrespective of the input type. + if fname == "float_power": + return getattr(self, op)(other).astype(float) + return getattr(self, op)(other) + + # Special handling for unary operations. + if fname == "negative": + return self * -1 + if fname == "positive": + return self.copy(deep=True) + if fname == "invert": + return ~self + if fname == "absolute": + return self.abs() + if fname == "fabs": + return self.abs().astype(np.float64) + + # Note: There are some operations that may be supported by libcudf but + # are not supported by pandas APIs. In particular, libcudf binary + # operations support logical and/or operations, but those operations + # are not defined on pd.Series/DataFrame. For now those operations will + # dispatch to cupy, but if ufuncs are ever a bottleneck we could add + # special handling to dispatch those (or any other) functions that we + # could implement without cupy. + + # Attempt to dispatch all other functions to cupy. + cupy_func = getattr(cp, fname) + if cupy_func: + # Indices must be aligned before converting to arrays. + if ufunc.nin == 2: + other = inputs[self is inputs[0]] + inputs, index = self._prep_for_binop(other, fname) + else: + inputs = { + name: (col, None, False, None) + for name, col in self._data.items() + } + index = self._index + + mask = None + data = [{} for _ in range(ufunc.nout)] + for name, (left, right, _, _) in inputs.items(): + cupy_inputs = [] + # TODO: I'm jumping through multiple hoops to get the unary + # behavior to match up with the binary. I should see if there + # are better patterns to employ here. + for inp in (left, right) if ufunc.nin == 2 else (left,): + if ( + isinstance(inp, cudf.core.column.ColumnBase) + and inp.has_nulls() + ): + new_mask = cudf.core.column.as_column(inp.nullmask) + + # TODO: This is a hackish way to perform a bitwise and + # of bitmasks. Once we expose + # cudf::detail::bitwise_and, then we can use that + # instead. + mask = new_mask if mask is None else (mask & new_mask) + + # Arbitrarily fill with zeros. For ufuncs, we assume + # that the end result propagates nulls via a bitwise + # and, so these elements are irrelevant. + inp = inp.fillna(0) + cupy_inputs.append(cp.asarray(inp)) + + cp_output = cupy_func(*cupy_inputs, **kwargs) + if ufunc.nout == 1: + cp_output = (cp_output,) + for i, out in enumerate(cp_output): + data[i][name] = cudf.core.column.as_column(out).set_mask( + mask + ) + + out = tuple( + self.__class__._from_data(out, index=index) for out in data + ) + return out[0] if ufunc.nout == 1 else out + + return NotImplemented + def _check_duplicate_level_names(specified, level_names): """Raise if any of `specified` has duplicates in `level_names`.""" diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 90ebeba5087..3aef4447a28 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -7,7 +7,6 @@ import pickle import warnings from collections import abc as abc -from itertools import repeat from numbers import Number from shutil import get_terminal_size from typing import Any, MutableMapping, Optional, Set, Union @@ -959,141 +958,6 @@ def to_frame(self, name=None): def memory_usage(self, index=True, deep=False): return sum(super().memory_usage(index, deep).values()) - # For more detail on this function and how it should work, see - # https://numpy.org/doc/stable/reference/ufuncs.html - def __array_ufunc__(self, ufunc, method, *inputs, **kwargs): - # We don't currently support reduction, accumulation, etc. We also - # don't support any special kwargs or higher arity ufuncs than binary. - if method != "__call__" or kwargs or ufunc.nin > 2: - return NotImplemented - - # Binary operations - binary_operations = { - # Arithmetic binary operations. - "add": "add", - "subtract": "sub", - "multiply": "mul", - "matmul": "matmul", - "divide": "truediv", - "true_divide": "truediv", - "floor_divide": "floordiv", - "power": "pow", - "float_power": "pow", - "remainder": "mod", - "mod": "mod", - "fmod": "mod", - # Bitwise binary operations. - "bitwise_and": "and", - "bitwise_or": "or", - "bitwise_xor": "xor", - # Comparison binary operators - "greater": "gt", - "greater_equal": "ge", - "less": "lt", - "less_equal": "le", - "not_equal": "ne", - "equal": "eq", - } - - # First look for methods of the class. - fname = ufunc.__name__ - if fname in binary_operations: - reflect = self is not inputs[0] - other = inputs[0] if reflect else inputs[1] - - # These operators need to be mapped to their inverses when - # performing a reflected operation because no reflected version of - # the operators themselves exist. - ops_without_reflection = { - "gt": "lt", - "ge": "le", - "lt": "gt", - "le": "ge", - # ne and eq are symmetric, so they are their own inverse op - "ne": "ne", - "eq": "eq", - } - - op = binary_operations[fname] - if reflect and op in ops_without_reflection: - op = ops_without_reflection[op] - reflect = False - op = f"__{'r' if reflect else ''}{op}__" - - # pandas bitwise operations return bools if indexes are misaligned. - # TODO: Generalize for other types of Frames - if ( - "bitwise" in fname - and isinstance(other, Series) - and not self.index.equals(other.index) - ): - return getattr(self, op)(other).astype(bool) - # Float_power returns float irrespective of the input type. - if fname == "float_power": - return getattr(self, op)(other).astype(float) - return getattr(self, op)(other) - - # Special handling for unary operations. - if fname == "negative": - return self * -1 - if fname == "positive": - return self.copy(deep=True) - if fname == "invert": - return ~self - if fname == "absolute": - return self.abs() - if fname == "fabs": - return self.abs().astype(np.float64) - - # Note: There are some operations that may be supported by libcudf but - # are not supported by pandas APIs. In particular, libcudf binary - # operations support logical and/or operations, but those operations - # are not defined on pd.Series/DataFrame. For now those operations will - # dispatch to cupy, but if ufuncs are ever a bottleneck we could add - # special handling to dispatch those (or any other) functions that we - # could implement without cupy. - - # Attempt to dispatch all other functions to cupy. - cupy_func = getattr(cupy, fname) - if cupy_func: - # Indices must be aligned before converting to arrays. - if ufunc.nin == 2 and all(map(isinstance, inputs, repeat(Series))): - inputs = _align_indices(inputs, allow_non_unique=True) - index = inputs[0].index - else: - index = self.index - - cupy_inputs = [] - mask = None - for inp in inputs: - # TODO: Generalize for other types of Frames - if isinstance(inp, Series) and inp.has_nulls: - new_mask = as_column(inp.nullmask) - - # TODO: This is a hackish way to perform a bitwise and of - # bitmasks. Once we expose cudf::detail::bitwise_and, then - # we can use that instead. - mask = new_mask if mask is None else (mask & new_mask) - - # Arbitrarily fill with zeros. For ufuncs, we assume that - # the end result propagates nulls via a bitwise and, so - # these elements are irrelevant. - inp = inp.fillna(0) - cupy_inputs.append(cupy.asarray(inp)) - - cp_output = cupy_func(*cupy_inputs, **kwargs) - - def make_frame(arr): - return self.__class__._from_data( - {self.name: as_column(arr).set_mask(mask)}, index=index - ) - - if ufunc.nout > 1: - return tuple(make_frame(out) for out in cp_output) - return make_frame(cp_output) - - return NotImplemented - def __array_function__(self, func, types, args, kwargs): handled_types = [cudf.Series] for t in types: @@ -1342,9 +1206,9 @@ def __repr__(self): lines.append(category_memory) return "\n".join(lines) - def _binaryop( + def _prep_for_binop( self, - other: Frame, + other: Any, fn: str, fill_value: Any = None, reflect: bool = False, @@ -1376,9 +1240,24 @@ def _binaryop( lhs = self operands = lhs._make_operands_for_binop(other, fill_value, reflect) + return operands, lhs._index + + def _binaryop( + self, + other: Frame, + fn: str, + fill_value: Any = None, + reflect: bool = False, + can_reindex: bool = False, + *args, + **kwargs, + ): + operands, out_index = self._prep_for_binop( + other, fn, fill_value, reflect, can_reindex + ) return ( - lhs._from_data( - data=lhs._colwise_binop(operands, fn), index=lhs._index, + self._from_data( + data=self._colwise_binop(operands, fn), index=out_index, ) if operands is not NotImplemented else NotImplemented diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index bf867923b57..50b206d3388 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. """Base class for Frame types that only have a single column.""" from __future__ import annotations @@ -274,7 +274,7 @@ def factorize(self, na_sentinel=-1): def _make_operands_for_binop( self, - other: T, + other: Any, fill_value: Any = None, reflect: bool = False, *args, @@ -310,7 +310,7 @@ def _make_operands_for_binop( else: result_name = self.name - # This needs to be tested correctly + # TODO: This needs to be tested correctly if isinstance(other, SingleColumnFrame): other = other._column elif not _is_scalar_or_zero_d_array(other): diff --git a/python/cudf/cudf/tests/test_array_ufunc.py b/python/cudf/cudf/tests/test_array_ufunc.py index a384ddecca6..f1aad1af9e6 100644 --- a/python/cudf/cudf/tests/test_array_ufunc.py +++ b/python/cudf/cudf/tests/test_array_ufunc.py @@ -109,7 +109,7 @@ def test_ufunc_series(ufunc, has_nulls, indexed): @pytest.mark.parametrize("reflect", [True, False]) def test_binary_ufunc_series_array(ufunc, has_nulls, indexed, type_, reflect): fname = ufunc.__name__ - if fname in ("greater", "greater_equal") and has_nulls: + if fname in ("greater", "greater_equal", "logical_and") and has_nulls: pytest.xfail( "The way cudf casts nans in arrays to nulls during binops with " "cudf objects is currently incompatible with pandas." @@ -181,3 +181,102 @@ def test_ufunc_cudf_series_error_with_out_kwarg(func): # this throws a value-error because of presence of out kwarg with pytest.raises(TypeError): func(x1=cudf_s1, x2=cudf_s2, out=cudf_s3) + + +# Skip matmul since it requires aligned shapes. +@pytest.mark.parametrize("ufunc", (uf for uf in _UFUNCS if uf != np.matmul)) +@pytest.mark.parametrize("has_nulls", [True, False]) +@pytest.mark.parametrize("indexed", [True, False]) +def test_ufunc_dataframe(ufunc, has_nulls, indexed): + # Note: This test assumes that all ufuncs are unary or binary. + fname = ufunc.__name__ + # TODO: When pandas starts supporting misaligned indexes properly, remove + # this check but enable the one below. + if indexed: + pytest.xfail( + "pandas does not currently support misaligned indexes in " + "DataFrames, but we do. Until this is fixed we will skip these " + "tests. See the error here: " + "https://github.com/pandas-dev/pandas/blob/main/pandas/core/arraylike.py#L212, " # noqa: E501 + "called from https://github.com/pandas-dev/pandas/blob/main/pandas/core/arraylike.py#L258" # noqa: E501 + ) + # TODO: Enable the check below when we remove the check above. + # if indexed and fname in ( + # "greater", + # "greater_equal", + # "less", + # "less_equal", + # "not_equal", + # "equal", + # ): + # pytest.skip("Comparison operators do not support misaligned indexes.") # noqa: E501 + + N = 100 + # Avoid zeros in either array to skip division by 0 errors. Also limit the + # scale to avoid issues with overflow, etc. We use ints because some + # operations (like bitwise ops) are not defined for floats. + # TODO: Add tests of mismatched columns etc. + pandas_args = args = [ + cudf.DataFrame( + {"foo": cp.random.randint(low=1, high=10, size=N)}, + index=cp.random.choice(range(N), N, False) if indexed else None, + ) + for _ in range(ufunc.nin) + ] + + if has_nulls: + # Converting nullable integer cudf.Series to pandas will produce a + # float pd.Series, so instead we replace nulls with an arbitrary + # integer value, precompute the mask, and then reapply it afterwards. + for arg in args: + set_random_null_mask_inplace(arg["foo"]) + pandas_args = [arg.copy() for arg in args] + for arg in pandas_args: + arg["foo"] = arg["foo"].fillna(0) + + # Note: Different indexes must be aligned before the mask is computed. + # This requires using an internal function (_align_indices), and that + # is unlikely to change for the foreseeable future. + aligned = ( + cudf.core.dataframe._align_indices(*args) + if indexed and ufunc.nin == 2 + else args + ) + mask = reduce( + operator.or_, (a["foo"].isna() for a in aligned) + ).to_pandas() + + try: + got = ufunc(*args) + except AttributeError as e: + # We xfail if we don't have an explicit dispatch and cupy doesn't have + # the method so that we can easily identify these methods. As of this + # writing, the only missing methods are isnat and heaviside. + if "module 'cupy' has no attribute" in str(e): + pytest.xfail(reason="Operation not supported by cupy") + raise + + expect = ufunc(*(arg.to_pandas() for arg in pandas_args)) + + try: + if ufunc.nout > 1: + for g, e in zip(got, expect): + if has_nulls: + e[mask] = np.nan + assert_eq(g, e) + else: + if has_nulls: + expect[mask] = np.nan + assert_eq(got, expect) + except AssertionError: + # TODO: This branch can be removed when + # https://github.com/rapidsai/cudf/issues/10178 is resolved + if fname in ("power", "float_power"): + not_equal = cudf.from_pandas(expect) != got + not_equal[got.isna()] = False + diffs = got[not_equal] - cudf.from_pandas( + expect[not_equal.to_pandas()] + ) + if diffs["foo"].abs().max() == 1: + pytest.xfail("https://github.com/rapidsai/cudf/issues/10178") + raise