Fix binary operations between Series and Index (#13842)

Fixes: #13836 This PR fixes binary operations between `Series` & `Index` (and vice-versa) by properly handling the return type and the names being assigned to the return results. This PR also adds an early exit that will return `False` for `eq` operations when the other column being compared is not of the same type. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Matthew Roeschke (https://github.com/mroeschke) URL: #13842
rapidsai · Aug 10, 2023 · b743cc7 · b743cc7
1 parent 2801a27
commit b743cc7
Show file tree

Hide file tree

Showing 9 changed files with 174 additions and 19 deletions.
diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py
@@ -31,6 +31,7 @@
 from cudf.core.column_accessor import ColumnAccessor
 from cudf.utils import ioutils
 from cudf.utils.dtypes import is_mixed_with_object_dtype
+from cudf.utils.utils import _is_same_name
 
 
 class BaseIndex(Serializable):
@@ -2010,7 +2011,4 @@ def _split(self, splits):
 
 
 def _get_result_name(left_name, right_name):
-    if left_name == right_name:
-        return left_name
-    else:
-        return None
+    return left_name if _is_same_name(left_name, right_name) else None
diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py
@@ -31,7 +31,7 @@
 from cudf.core.column import ColumnBase, as_column, column, string
 from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
 from cudf.utils.dtypes import _get_base_dtype
-from cudf.utils.utils import _fillna_natwise
+from cudf.utils.utils import _all_bools_with_nulls, _fillna_natwise
 
 _guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format
 
@@ -424,12 +424,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
         )
         lhs, rhs = (other, self) if reflect else (self, other)
         out_dtype = None
-        if op in {
-            "__eq__",
-            "NULL_EQUALS",
-        }:
-            out_dtype = cudf.dtype(np.bool_)
-        elif (
+
+        if (
             op
             in {
                 "__ne__",
@@ -455,6 +451,18 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
             # well-defined if this operation was not invoked via reflection.
             elif other_is_timedelta and not reflect:
                 out_dtype = _resolve_mixed_dtypes(lhs, rhs, "datetime64")
+        elif op in {
+            "__eq__",
+            "NULL_EQUALS",
+            "__ne__",
+        }:
+            out_dtype = cudf.dtype(np.bool_)
+            if isinstance(other, ColumnBase) and not isinstance(
+                other, DatetimeColumn
+            ):
+                return _all_bools_with_nulls(
+                    self, other, bool_fill_value=op == "__ne__"
+                )
 
         if out_dtype is None:
             return NotImplemented

diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py
@@ -16,7 +16,7 @@
 from cudf.core.buffer import Buffer, acquire_spill_lock
 from cudf.core.column import ColumnBase, column, string
 from cudf.utils.dtypes import np_to_pa_dtype
-from cudf.utils.utils import _fillna_natwise
+from cudf.utils.utils import _all_bools_with_nulls, _fillna_natwise
 
 _dtype_to_format_conversion = {
     "timedelta64[ns]": "%D days %H:%M:%S",
@@ -202,6 +202,13 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
         elif other.dtype.kind in {"f", "i", "u"}:
             if op in {"__mul__", "__mod__", "__truediv__", "__floordiv__"}:
                 out_dtype = self.dtype
+            elif op in {"__eq__", "NULL_EQUALS", "__ne__"}:
+                if isinstance(other, ColumnBase) and not isinstance(
+                    other, TimeDeltaColumn
+                ):
+                    return _all_bools_with_nulls(
+                        self, other, bool_fill_value=op == "__ne__"
+                    )
 
         if out_dtype is None:
             return NotImplemented

diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py
@@ -65,6 +65,7 @@
 )
 from cudf.utils.utils import (
     _cudf_nvtx_annotate,
+    _is_same_name,
     _warn_no_dask_cudf,
     search_range,
 )
@@ -1029,12 +1030,27 @@ def _binaryop(
         operands = self._make_operands_for_binop(other, fill_value, reflect)
         if operands is NotImplemented:
             return NotImplemented
-        ret = _index_from_data(self._colwise_binop(operands, op))
+        binop_result = self._colwise_binop(operands, op)
+
+        if isinstance(other, cudf.Series):
+            ret = other._from_data_like_self(binop_result)
+            ret.name = (
+                self.name
+                if cudf.utils.utils._is_same_name(self.name, other.name)
+                else None
+            )
+        else:
+            ret = _index_from_data(binop_result)
 
         # pandas returns numpy arrays when the outputs are boolean. We
         # explicitly _do not_ use isinstance here: we want only boolean
         # GenericIndexes, not dtype-specific subclasses.
-        if type(ret) is GenericIndex and ret.dtype.kind == "b":
+        if (
+            isinstance(ret, (GenericIndex, cudf.Series))
+            and ret.dtype.kind == "b"
+        ):
+            if ret._column.has_nulls():
+                ret = ret.fillna(op == "__ne__")
             return ret.values
         return ret
 
@@ -3309,7 +3325,7 @@ def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex:
     if isinstance(arbitrary, cudf.MultiIndex):
         return arbitrary
     elif isinstance(arbitrary, BaseIndex):
-        if arbitrary.name == kwargs["name"]:
+        if _is_same_name(arbitrary.name, kwargs["name"]):
             return arbitrary
         idx = arbitrary.copy(deep=False)
         idx.rename(kwargs["name"], inplace=True)

diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py
@@ -25,7 +25,7 @@
 from cudf.core._compat import PANDAS_GE_150
 from cudf.core.frame import Frame
 from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index
-from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate
+from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate, _is_same_name
 
 
 def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]:
@@ -1891,7 +1891,7 @@ def _maybe_match_names(self, other):
         if len(self.names) != len(other.names):
             return [None] * len(self.names)
         return [
-            self_name if self_name == other_name else None
+            self_name if _is_same_name(self_name, other_name) else None
             for self_name, other_name in zip(self.names, other.names)
         ]
 

diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py
@@ -1505,9 +1505,14 @@ def _make_operands_and_index_for_binop(
                     "Can only compare identically-labeled Series objects"
                 )
             lhs, other = _align_indices([self, other], allow_non_unique=True)
-            can_use_self_column_name = self.name == other.name
         else:
             lhs = self
+
+        try:
+            can_use_self_column_name = cudf.utils.utils._is_same_name(
+                self.name, other.name
+            )
+        except AttributeError:
             can_use_self_column_name = False
 
         operands = lhs._make_operands_for_binop(other, fill_value, reflect)

diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py
@@ -327,7 +327,9 @@ def _make_operands_for_binop(
         # Get the appropriate name for output operations involving two objects
         # that are Series-like objects. The output shares the lhs's name unless
         # the rhs is a _differently_ named Series-like object.
-        if isinstance(other, SingleColumnFrame) and self.name != other.name:
+        if isinstance(
+            other, SingleColumnFrame
+        ) and not cudf.utils.utils._is_same_name(self.name, other.name):
             result_name = None
         else:
             result_name = self.name

diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py
@@ -3,6 +3,7 @@
 import decimal
 import operator
 import random
+import warnings
 from itertools import combinations_with_replacement, product
 
 import cupy as cp
@@ -149,6 +150,33 @@
     lambda x: cudf.Scalar(0) / x,
 ]
 
+_series_or_index_names = [
+    None,
+    pd.NA,
+    cudf.NA,
+    np.nan,
+    float("NaN"),
+    "abc",
+    1,
+    pd.NaT,
+    np.datetime64("nat"),
+    np.timedelta64("NaT"),
+    np.timedelta64(10, "D"),
+    np.timedelta64(5, "D"),
+    np.datetime64("1970-01-01 00:00:00.000000001"),
+    np.datetime64("1970-01-01 00:00:00.000000002"),
+    pd.Timestamp(1),
+    pd.Timestamp(2),
+    pd.Timedelta(1),
+    pd.Timedelta(2),
+    decimal.Decimal("NaN"),
+    decimal.Decimal("1.2"),
+    np.int64(1),
+    np.int32(1),
+    np.float32(1),
+    pd.Timestamp(1),
+]
+
 pytest_xfail = pytest.mark.xfail
 pytestmark = pytest.mark.spilling
 
@@ -3265,3 +3293,47 @@ def test_binop_integer_power_int_scalar():
     expected = base**exponent.value
     got = base**exponent
     utils.assert_eq(expected, got)
+
+
+@pytest.mark.parametrize("op", _binops)
+def test_binop_index_series(op):
+    gi = cudf.Index([10, 11, 12])
+    gs = cudf.Series([1, 2, 3])
+
+    actual = op(gi, gs)
+    expected = op(gi.to_pandas(), gs.to_pandas())
+
+    utils.assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize("name1", _series_or_index_names)
+@pytest.mark.parametrize("name2", _series_or_index_names)
+def test_binop_index_dt_td_series_with_names(name1, name2):
+    gi = cudf.Index([1, 2, 3], dtype="datetime64[ns]", name=name1)
+    gs = cudf.Series([10, 11, 12], dtype="timedelta64[ns]", name=name2)
+    with warnings.catch_warnings():
+        # Numpy raises a deprecation warning:
+        # "elementwise comparison failed; this will raise an error "
+        warnings.simplefilter("ignore", (DeprecationWarning,))
+
+        expected = gi.to_pandas() + gs.to_pandas()
+    actual = gi + gs
+
+    utils.assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize("data1", [[1, 2, 3], [10, 11, None]])
+@pytest.mark.parametrize("data2", [[1, 2, 3], [10, 11, None]])
+def test_binop_eq_ne_index_series(data1, data2):
+    gi = cudf.Index(data1, dtype="datetime64[ns]", name=np.nan)
+    gs = cudf.Series(data2, dtype="timedelta64[ns]", name="abc")
+
+    actual = gi == gs
+    expected = gi.to_pandas() == gs.to_pandas()
+
+    utils.assert_eq(expected, actual)
+
+    actual = gi != gs
+    expected = gi.to_pandas() != gs.to_pandas()
+
+    utils.assert_eq(expected, actual)
diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py
@@ -1,5 +1,6 @@
 # Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
+import decimal
 import functools
 import hashlib
 import os
@@ -384,3 +385,49 @@ def wrapper(self):
         return fn(self)
 
     return wrapper
+
+
+def _is_same_name(left_name, right_name):
+    # Internal utility to compare if two names are same.
+    with warnings.catch_warnings():
+        # numpy throws warnings while comparing
+        # NaT values with non-NaT values.
+        warnings.simplefilter("ignore")
+        try:
+            same = (left_name is right_name) or (left_name == right_name)
+            if not same:
+                if isinstance(left_name, decimal.Decimal) and isinstance(
+                    right_name, decimal.Decimal
+                ):
+                    return left_name.is_nan() and right_name.is_nan()
+                if isinstance(left_name, float) and isinstance(
+                    right_name, float
+                ):
+                    return np.isnan(left_name) and np.isnan(right_name)
+                if isinstance(left_name, np.datetime64) and isinstance(
+                    right_name, np.datetime64
+                ):
+                    return np.isnan(left_name) and np.isnan(right_name)
+            return same
+        except TypeError:
+            return False
+
+
+def _all_bools_with_nulls(lhs, rhs, bool_fill_value):
+    # Internal utility to construct a boolean column
+    # by combining nulls from `lhs` & `rhs`.
+    if lhs.has_nulls() and rhs.has_nulls():
+        result_mask = lhs._get_mask_as_column() & rhs._get_mask_as_column()
+    elif lhs.has_nulls():
+        result_mask = lhs._get_mask_as_column()
+    elif rhs.has_nulls():
+        result_mask = rhs._get_mask_as_column()
+    else:
+        result_mask = None
+
+    result_col = column.full(
+        size=len(lhs), fill_value=bool_fill_value, dtype=cudf.dtype(np.bool_)
+    )
+    if result_mask is not None:
+        result_col = result_col.set_mask(result_mask.as_mask())
+    return result_col