Skip to content

Commit

Permalink
Fix binary operations between Series and Index (#13842)
Browse files Browse the repository at this point in the history
Fixes: #13836 

This PR fixes binary operations between `Series` & `Index` (and vice-versa) by properly handling the return type and the names being assigned to the return results. This PR also adds an early exit that will return `False` for `eq` operations when the other column being compared is not of the same type.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Matthew Roeschke (https://github.com/mroeschke)

URL: #13842
  • Loading branch information
galipremsagar authored Aug 10, 2023
1 parent 2801a27 commit b743cc7
Show file tree
Hide file tree
Showing 9 changed files with 174 additions and 19 deletions.
6 changes: 2 additions & 4 deletions python/cudf/cudf/core/_base_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,6 +31,7 @@
from cudf.core.column_accessor import ColumnAccessor
from cudf.utils import ioutils
from cudf.utils.dtypes import is_mixed_with_object_dtype
from cudf.utils.utils import _is_same_name


class BaseIndex(Serializable):
Expand Down Expand Up @@ -2010,7 +2011,4 @@ def _split(self, splits):


def _get_result_name(left_name, right_name):
if left_name == right_name:
return left_name
else:
return None
return left_name if _is_same_name(left_name, right_name) else None
22 changes: 15 additions & 7 deletions python/cudf/cudf/core/column/datetime.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,7 @@
from cudf.core.column import ColumnBase, as_column, column, string
from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion
from cudf.utils.dtypes import _get_base_dtype
from cudf.utils.utils import _fillna_natwise
from cudf.utils.utils import _all_bools_with_nulls, _fillna_natwise

_guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format

Expand Down Expand Up @@ -424,12 +424,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
)
lhs, rhs = (other, self) if reflect else (self, other)
out_dtype = None
if op in {
"__eq__",
"NULL_EQUALS",
}:
out_dtype = cudf.dtype(np.bool_)
elif (

if (
op
in {
"__ne__",
Expand All @@ -455,6 +451,18 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
# well-defined if this operation was not invoked via reflection.
elif other_is_timedelta and not reflect:
out_dtype = _resolve_mixed_dtypes(lhs, rhs, "datetime64")
elif op in {
"__eq__",
"NULL_EQUALS",
"__ne__",
}:
out_dtype = cudf.dtype(np.bool_)
if isinstance(other, ColumnBase) and not isinstance(
other, DatetimeColumn
):
return _all_bools_with_nulls(
self, other, bool_fill_value=op == "__ne__"
)

if out_dtype is None:
return NotImplemented
Expand Down
9 changes: 8 additions & 1 deletion python/cudf/cudf/core/column/timedelta.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,7 +16,7 @@
from cudf.core.buffer import Buffer, acquire_spill_lock
from cudf.core.column import ColumnBase, column, string
from cudf.utils.dtypes import np_to_pa_dtype
from cudf.utils.utils import _fillna_natwise
from cudf.utils.utils import _all_bools_with_nulls, _fillna_natwise

_dtype_to_format_conversion = {
"timedelta64[ns]": "%D days %H:%M:%S",
Expand Down Expand Up @@ -202,6 +202,13 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase:
elif other.dtype.kind in {"f", "i", "u"}:
if op in {"__mul__", "__mod__", "__truediv__", "__floordiv__"}:
out_dtype = self.dtype
elif op in {"__eq__", "NULL_EQUALS", "__ne__"}:
if isinstance(other, ColumnBase) and not isinstance(
other, TimeDeltaColumn
):
return _all_bools_with_nulls(
self, other, bool_fill_value=op == "__ne__"
)

if out_dtype is None:
return NotImplemented
Expand Down
22 changes: 19 additions & 3 deletions python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,7 @@
)
from cudf.utils.utils import (
_cudf_nvtx_annotate,
_is_same_name,
_warn_no_dask_cudf,
search_range,
)
Expand Down Expand Up @@ -1029,12 +1030,27 @@ def _binaryop(
operands = self._make_operands_for_binop(other, fill_value, reflect)
if operands is NotImplemented:
return NotImplemented
ret = _index_from_data(self._colwise_binop(operands, op))
binop_result = self._colwise_binop(operands, op)

if isinstance(other, cudf.Series):
ret = other._from_data_like_self(binop_result)
ret.name = (
self.name
if cudf.utils.utils._is_same_name(self.name, other.name)
else None
)
else:
ret = _index_from_data(binop_result)

# pandas returns numpy arrays when the outputs are boolean. We
# explicitly _do not_ use isinstance here: we want only boolean
# GenericIndexes, not dtype-specific subclasses.
if type(ret) is GenericIndex and ret.dtype.kind == "b":
if (
isinstance(ret, (GenericIndex, cudf.Series))
and ret.dtype.kind == "b"
):
if ret._column.has_nulls():
ret = ret.fillna(op == "__ne__")
return ret.values
return ret

Expand Down Expand Up @@ -3309,7 +3325,7 @@ def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex:
if isinstance(arbitrary, cudf.MultiIndex):
return arbitrary
elif isinstance(arbitrary, BaseIndex):
if arbitrary.name == kwargs["name"]:
if _is_same_name(arbitrary.name, kwargs["name"]):
return arbitrary
idx = arbitrary.copy(deep=False)
idx.rename(kwargs["name"], inplace=True)
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/core/multiindex.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,7 +25,7 @@
from cudf.core._compat import PANDAS_GE_150
from cudf.core.frame import Frame
from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index
from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate
from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate, _is_same_name


def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]:
Expand Down Expand Up @@ -1891,7 +1891,7 @@ def _maybe_match_names(self, other):
if len(self.names) != len(other.names):
return [None] * len(self.names)
return [
self_name if self_name == other_name else None
self_name if _is_same_name(self_name, other_name) else None
for self_name, other_name in zip(self.names, other.names)
]

Expand Down
7 changes: 6 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -1505,9 +1505,14 @@ def _make_operands_and_index_for_binop(
"Can only compare identically-labeled Series objects"
)
lhs, other = _align_indices([self, other], allow_non_unique=True)
can_use_self_column_name = self.name == other.name
else:
lhs = self

try:
can_use_self_column_name = cudf.utils.utils._is_same_name(
self.name, other.name
)
except AttributeError:
can_use_self_column_name = False

operands = lhs._make_operands_for_binop(other, fill_value, reflect)
Expand Down
4 changes: 3 additions & 1 deletion python/cudf/cudf/core/single_column_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -327,7 +327,9 @@ def _make_operands_for_binop(
# Get the appropriate name for output operations involving two objects
# that are Series-like objects. The output shares the lhs's name unless
# the rhs is a _differently_ named Series-like object.
if isinstance(other, SingleColumnFrame) and self.name != other.name:
if isinstance(
other, SingleColumnFrame
) and not cudf.utils.utils._is_same_name(self.name, other.name):
result_name = None
else:
result_name = self.name
Expand Down
72 changes: 72 additions & 0 deletions python/cudf/cudf/tests/test_binops.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,6 +3,7 @@
import decimal
import operator
import random
import warnings
from itertools import combinations_with_replacement, product

import cupy as cp
Expand Down Expand Up @@ -149,6 +150,33 @@
lambda x: cudf.Scalar(0) / x,
]

_series_or_index_names = [
None,
pd.NA,
cudf.NA,
np.nan,
float("NaN"),
"abc",
1,
pd.NaT,
np.datetime64("nat"),
np.timedelta64("NaT"),
np.timedelta64(10, "D"),
np.timedelta64(5, "D"),
np.datetime64("1970-01-01 00:00:00.000000001"),
np.datetime64("1970-01-01 00:00:00.000000002"),
pd.Timestamp(1),
pd.Timestamp(2),
pd.Timedelta(1),
pd.Timedelta(2),
decimal.Decimal("NaN"),
decimal.Decimal("1.2"),
np.int64(1),
np.int32(1),
np.float32(1),
pd.Timestamp(1),
]

pytest_xfail = pytest.mark.xfail
pytestmark = pytest.mark.spilling

Expand Down Expand Up @@ -3265,3 +3293,47 @@ def test_binop_integer_power_int_scalar():
expected = base**exponent.value
got = base**exponent
utils.assert_eq(expected, got)


@pytest.mark.parametrize("op", _binops)
def test_binop_index_series(op):
gi = cudf.Index([10, 11, 12])
gs = cudf.Series([1, 2, 3])

actual = op(gi, gs)
expected = op(gi.to_pandas(), gs.to_pandas())

utils.assert_eq(expected, actual)


@pytest.mark.parametrize("name1", _series_or_index_names)
@pytest.mark.parametrize("name2", _series_or_index_names)
def test_binop_index_dt_td_series_with_names(name1, name2):
gi = cudf.Index([1, 2, 3], dtype="datetime64[ns]", name=name1)
gs = cudf.Series([10, 11, 12], dtype="timedelta64[ns]", name=name2)
with warnings.catch_warnings():
# Numpy raises a deprecation warning:
# "elementwise comparison failed; this will raise an error "
warnings.simplefilter("ignore", (DeprecationWarning,))

expected = gi.to_pandas() + gs.to_pandas()
actual = gi + gs

utils.assert_eq(expected, actual)


@pytest.mark.parametrize("data1", [[1, 2, 3], [10, 11, None]])
@pytest.mark.parametrize("data2", [[1, 2, 3], [10, 11, None]])
def test_binop_eq_ne_index_series(data1, data2):
gi = cudf.Index(data1, dtype="datetime64[ns]", name=np.nan)
gs = cudf.Series(data2, dtype="timedelta64[ns]", name="abc")

actual = gi == gs
expected = gi.to_pandas() == gs.to_pandas()

utils.assert_eq(expected, actual)

actual = gi != gs
expected = gi.to_pandas() != gs.to_pandas()

utils.assert_eq(expected, actual)
47 changes: 47 additions & 0 deletions python/cudf/cudf/utils/utils.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

import decimal
import functools
import hashlib
import os
Expand Down Expand Up @@ -384,3 +385,49 @@ def wrapper(self):
return fn(self)

return wrapper


def _is_same_name(left_name, right_name):
# Internal utility to compare if two names are same.
with warnings.catch_warnings():
# numpy throws warnings while comparing
# NaT values with non-NaT values.
warnings.simplefilter("ignore")
try:
same = (left_name is right_name) or (left_name == right_name)
if not same:
if isinstance(left_name, decimal.Decimal) and isinstance(
right_name, decimal.Decimal
):
return left_name.is_nan() and right_name.is_nan()
if isinstance(left_name, float) and isinstance(
right_name, float
):
return np.isnan(left_name) and np.isnan(right_name)
if isinstance(left_name, np.datetime64) and isinstance(
right_name, np.datetime64
):
return np.isnan(left_name) and np.isnan(right_name)
return same
except TypeError:
return False


def _all_bools_with_nulls(lhs, rhs, bool_fill_value):
# Internal utility to construct a boolean column
# by combining nulls from `lhs` & `rhs`.
if lhs.has_nulls() and rhs.has_nulls():
result_mask = lhs._get_mask_as_column() & rhs._get_mask_as_column()
elif lhs.has_nulls():
result_mask = lhs._get_mask_as_column()
elif rhs.has_nulls():
result_mask = rhs._get_mask_as_column()
else:
result_mask = None

result_col = column.full(
size=len(lhs), fill_value=bool_fill_value, dtype=cudf.dtype(np.bool_)
)
if result_mask is not None:
result_col = result_col.set_mask(result_mask.as_mask())
return result_col

0 comments on commit b743cc7

Please sign in to comment.