diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index 981a8323138..d593f0df138 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -31,6 +31,7 @@ from cudf.core.column_accessor import ColumnAccessor from cudf.utils import ioutils from cudf.utils.dtypes import is_mixed_with_object_dtype +from cudf.utils.utils import _is_same_name class BaseIndex(Serializable): @@ -2010,7 +2011,4 @@ def _split(self, splits): def _get_result_name(left_name, right_name): - if left_name == right_name: - return left_name - else: - return None + return left_name if _is_same_name(left_name, right_name) else None diff --git a/python/cudf/cudf/core/column/datetime.py b/python/cudf/cudf/core/column/datetime.py index 84d283d3f22..8073092775d 100644 --- a/python/cudf/cudf/core/column/datetime.py +++ b/python/cudf/cudf/core/column/datetime.py @@ -31,7 +31,7 @@ from cudf.core.column import ColumnBase, as_column, column, string from cudf.core.column.timedelta import _unit_to_nanoseconds_conversion from cudf.utils.dtypes import _get_base_dtype -from cudf.utils.utils import _fillna_natwise +from cudf.utils.utils import _all_bools_with_nulls, _fillna_natwise _guess_datetime_format = pd.core.tools.datetimes.guess_datetime_format @@ -424,12 +424,8 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: ) lhs, rhs = (other, self) if reflect else (self, other) out_dtype = None - if op in { - "__eq__", - "NULL_EQUALS", - }: - out_dtype = cudf.dtype(np.bool_) - elif ( + + if ( op in { "__ne__", @@ -455,6 +451,18 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: # well-defined if this operation was not invoked via reflection. elif other_is_timedelta and not reflect: out_dtype = _resolve_mixed_dtypes(lhs, rhs, "datetime64") + elif op in { + "__eq__", + "NULL_EQUALS", + "__ne__", + }: + out_dtype = cudf.dtype(np.bool_) + if isinstance(other, ColumnBase) and not isinstance( + other, DatetimeColumn + ): + return _all_bools_with_nulls( + self, other, bool_fill_value=op == "__ne__" + ) if out_dtype is None: return NotImplemented diff --git a/python/cudf/cudf/core/column/timedelta.py b/python/cudf/cudf/core/column/timedelta.py index 272f6e20985..b571461b307 100644 --- a/python/cudf/cudf/core/column/timedelta.py +++ b/python/cudf/cudf/core/column/timedelta.py @@ -16,7 +16,7 @@ from cudf.core.buffer import Buffer, acquire_spill_lock from cudf.core.column import ColumnBase, column, string from cudf.utils.dtypes import np_to_pa_dtype -from cudf.utils.utils import _fillna_natwise +from cudf.utils.utils import _all_bools_with_nulls, _fillna_natwise _dtype_to_format_conversion = { "timedelta64[ns]": "%D days %H:%M:%S", @@ -202,6 +202,13 @@ def _binaryop(self, other: ColumnBinaryOperand, op: str) -> ColumnBase: elif other.dtype.kind in {"f", "i", "u"}: if op in {"__mul__", "__mod__", "__truediv__", "__floordiv__"}: out_dtype = self.dtype + elif op in {"__eq__", "NULL_EQUALS", "__ne__"}: + if isinstance(other, ColumnBase) and not isinstance( + other, TimeDeltaColumn + ): + return _all_bools_with_nulls( + self, other, bool_fill_value=op == "__ne__" + ) if out_dtype is None: return NotImplemented diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 44a1620da8a..fa9e63fa4cc 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -65,6 +65,7 @@ ) from cudf.utils.utils import ( _cudf_nvtx_annotate, + _is_same_name, _warn_no_dask_cudf, search_range, ) @@ -1029,12 +1030,27 @@ def _binaryop( operands = self._make_operands_for_binop(other, fill_value, reflect) if operands is NotImplemented: return NotImplemented - ret = _index_from_data(self._colwise_binop(operands, op)) + binop_result = self._colwise_binop(operands, op) + + if isinstance(other, cudf.Series): + ret = other._from_data_like_self(binop_result) + ret.name = ( + self.name + if cudf.utils.utils._is_same_name(self.name, other.name) + else None + ) + else: + ret = _index_from_data(binop_result) # pandas returns numpy arrays when the outputs are boolean. We # explicitly _do not_ use isinstance here: we want only boolean # GenericIndexes, not dtype-specific subclasses. - if type(ret) is GenericIndex and ret.dtype.kind == "b": + if ( + isinstance(ret, (GenericIndex, cudf.Series)) + and ret.dtype.kind == "b" + ): + if ret._column.has_nulls(): + ret = ret.fillna(op == "__ne__") return ret.values return ret @@ -3309,7 +3325,7 @@ def as_index(arbitrary, nan_as_null=None, **kwargs) -> BaseIndex: if isinstance(arbitrary, cudf.MultiIndex): return arbitrary elif isinstance(arbitrary, BaseIndex): - if arbitrary.name == kwargs["name"]: + if _is_same_name(arbitrary.name, kwargs["name"]): return arbitrary idx = arbitrary.copy(deep=False) idx.rename(kwargs["name"], inplace=True) diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index 9285a21f696..bb0c25a9970 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -25,7 +25,7 @@ from cudf.core._compat import PANDAS_GE_150 from cudf.core.frame import Frame from cudf.core.index import BaseIndex, _lexsorted_equal_range, as_index -from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate +from cudf.utils.utils import NotIterable, _cudf_nvtx_annotate, _is_same_name def _maybe_indices_to_slice(indices: cp.ndarray) -> Union[slice, cp.ndarray]: @@ -1891,7 +1891,7 @@ def _maybe_match_names(self, other): if len(self.names) != len(other.names): return [None] * len(self.names) return [ - self_name if self_name == other_name else None + self_name if _is_same_name(self_name, other_name) else None for self_name, other_name in zip(self.names, other.names) ] diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index 6ff0584538e..a0cec69eca9 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -1505,9 +1505,14 @@ def _make_operands_and_index_for_binop( "Can only compare identically-labeled Series objects" ) lhs, other = _align_indices([self, other], allow_non_unique=True) - can_use_self_column_name = self.name == other.name else: lhs = self + + try: + can_use_self_column_name = cudf.utils.utils._is_same_name( + self.name, other.name + ) + except AttributeError: can_use_self_column_name = False operands = lhs._make_operands_for_binop(other, fill_value, reflect) diff --git a/python/cudf/cudf/core/single_column_frame.py b/python/cudf/cudf/core/single_column_frame.py index ffb432ed14a..7c019f0722c 100644 --- a/python/cudf/cudf/core/single_column_frame.py +++ b/python/cudf/cudf/core/single_column_frame.py @@ -327,7 +327,9 @@ def _make_operands_for_binop( # Get the appropriate name for output operations involving two objects # that are Series-like objects. The output shares the lhs's name unless # the rhs is a _differently_ named Series-like object. - if isinstance(other, SingleColumnFrame) and self.name != other.name: + if isinstance( + other, SingleColumnFrame + ) and not cudf.utils.utils._is_same_name(self.name, other.name): result_name = None else: result_name = self.name diff --git a/python/cudf/cudf/tests/test_binops.py b/python/cudf/cudf/tests/test_binops.py index 6307c3655c1..4965fb728c2 100644 --- a/python/cudf/cudf/tests/test_binops.py +++ b/python/cudf/cudf/tests/test_binops.py @@ -3,6 +3,7 @@ import decimal import operator import random +import warnings from itertools import combinations_with_replacement, product import cupy as cp @@ -149,6 +150,33 @@ lambda x: cudf.Scalar(0) / x, ] +_series_or_index_names = [ + None, + pd.NA, + cudf.NA, + np.nan, + float("NaN"), + "abc", + 1, + pd.NaT, + np.datetime64("nat"), + np.timedelta64("NaT"), + np.timedelta64(10, "D"), + np.timedelta64(5, "D"), + np.datetime64("1970-01-01 00:00:00.000000001"), + np.datetime64("1970-01-01 00:00:00.000000002"), + pd.Timestamp(1), + pd.Timestamp(2), + pd.Timedelta(1), + pd.Timedelta(2), + decimal.Decimal("NaN"), + decimal.Decimal("1.2"), + np.int64(1), + np.int32(1), + np.float32(1), + pd.Timestamp(1), +] + pytest_xfail = pytest.mark.xfail pytestmark = pytest.mark.spilling @@ -3265,3 +3293,47 @@ def test_binop_integer_power_int_scalar(): expected = base**exponent.value got = base**exponent utils.assert_eq(expected, got) + + +@pytest.mark.parametrize("op", _binops) +def test_binop_index_series(op): + gi = cudf.Index([10, 11, 12]) + gs = cudf.Series([1, 2, 3]) + + actual = op(gi, gs) + expected = op(gi.to_pandas(), gs.to_pandas()) + + utils.assert_eq(expected, actual) + + +@pytest.mark.parametrize("name1", _series_or_index_names) +@pytest.mark.parametrize("name2", _series_or_index_names) +def test_binop_index_dt_td_series_with_names(name1, name2): + gi = cudf.Index([1, 2, 3], dtype="datetime64[ns]", name=name1) + gs = cudf.Series([10, 11, 12], dtype="timedelta64[ns]", name=name2) + with warnings.catch_warnings(): + # Numpy raises a deprecation warning: + # "elementwise comparison failed; this will raise an error " + warnings.simplefilter("ignore", (DeprecationWarning,)) + + expected = gi.to_pandas() + gs.to_pandas() + actual = gi + gs + + utils.assert_eq(expected, actual) + + +@pytest.mark.parametrize("data1", [[1, 2, 3], [10, 11, None]]) +@pytest.mark.parametrize("data2", [[1, 2, 3], [10, 11, None]]) +def test_binop_eq_ne_index_series(data1, data2): + gi = cudf.Index(data1, dtype="datetime64[ns]", name=np.nan) + gs = cudf.Series(data2, dtype="timedelta64[ns]", name="abc") + + actual = gi == gs + expected = gi.to_pandas() == gs.to_pandas() + + utils.assert_eq(expected, actual) + + actual = gi != gs + expected = gi.to_pandas() != gs.to_pandas() + + utils.assert_eq(expected, actual) diff --git a/python/cudf/cudf/utils/utils.py b/python/cudf/cudf/utils/utils.py index 9298d283b64..dadd9aa2cc4 100644 --- a/python/cudf/cudf/utils/utils.py +++ b/python/cudf/cudf/utils/utils.py @@ -1,5 +1,6 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. +import decimal import functools import hashlib import os @@ -384,3 +385,49 @@ def wrapper(self): return fn(self) return wrapper + + +def _is_same_name(left_name, right_name): + # Internal utility to compare if two names are same. + with warnings.catch_warnings(): + # numpy throws warnings while comparing + # NaT values with non-NaT values. + warnings.simplefilter("ignore") + try: + same = (left_name is right_name) or (left_name == right_name) + if not same: + if isinstance(left_name, decimal.Decimal) and isinstance( + right_name, decimal.Decimal + ): + return left_name.is_nan() and right_name.is_nan() + if isinstance(left_name, float) and isinstance( + right_name, float + ): + return np.isnan(left_name) and np.isnan(right_name) + if isinstance(left_name, np.datetime64) and isinstance( + right_name, np.datetime64 + ): + return np.isnan(left_name) and np.isnan(right_name) + return same + except TypeError: + return False + + +def _all_bools_with_nulls(lhs, rhs, bool_fill_value): + # Internal utility to construct a boolean column + # by combining nulls from `lhs` & `rhs`. + if lhs.has_nulls() and rhs.has_nulls(): + result_mask = lhs._get_mask_as_column() & rhs._get_mask_as_column() + elif lhs.has_nulls(): + result_mask = lhs._get_mask_as_column() + elif rhs.has_nulls(): + result_mask = rhs._get_mask_as_column() + else: + result_mask = None + + result_col = column.full( + size=len(lhs), fill_value=bool_fill_value, dtype=cudf.dtype(np.bool_) + ) + if result_mask is not None: + result_col = result_col.set_mask(result_mask.as_mask()) + return result_col