diff --git a/python/cudf/cudf/_lib/column.pyi b/python/cudf/cudf/_lib/column.pyi index bafa1c914fd..dafaa8f4d1d 100644 --- a/python/cudf/cudf/_lib/column.pyi +++ b/python/cudf/cudf/_lib/column.pyi @@ -2,7 +2,7 @@ from __future__ import annotations -from typing import Optional, Tuple, TypeVar, Union +from typing import Dict, Optional, Tuple, TypeVar from cudf._typing import Dtype, DtypeObj, ScalarLike from cudf.core.buffer import Buffer @@ -21,6 +21,7 @@ class Column: _null_count: int _children: Tuple[ColumnBase, ...] _base_children: Tuple[ColumnBase, ...] + _distinct_count: Dict[bool, int] def __init__( self, diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index 466be8dd21e..9a89cc2e97e 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -80,6 +80,7 @@ cdef class Column: self._size = size self._cached_sizeof = None + self._distinct_count = {} self._dtype = dtype self._offset = offset self._null_count = null_count @@ -203,9 +204,14 @@ cdef class Column: raise ValueError(error_msg) self._mask = None - self._null_count = None self._children = None self._base_mask = value + self._clear_cache() + + def _clear_cache(self): + self._distinct_count = {} + self._cached_sizeof = None + self._null_count = None def set_mask(self, value): """ diff --git a/python/cudf/cudf/core/_base_index.py b/python/cudf/cudf/core/_base_index.py index d18b536fa65..590bff3b19d 100644 --- a/python/cudf/cudf/core/_base_index.py +++ b/python/cudf/cudf/core/_base_index.py @@ -261,6 +261,211 @@ def set_names(self, names, level=None, inplace=False): return self._set_names(names=names, inplace=inplace) + @property + def has_duplicates(self): + return not self.is_unique + + def union(self, other, sort=None): + """ + Form the union of two Index objects. + + Parameters + ---------- + other : Index or array-like + sort : bool or None, default None + Whether to sort the resulting Index. + + * None : Sort the result, except when + + 1. 
`self` and `other` are equal. + 2. `self` or `other` has length 0. + + * False : do not sort the result. + + Returns + ------- + union : Index + + Examples + -------- + Union of an Index + >>> import cudf + >>> import pandas as pd + >>> idx1 = cudf.Index([1, 2, 3, 4]) + >>> idx2 = cudf.Index([3, 4, 5, 6]) + >>> idx1.union(idx2) + Int64Index([1, 2, 3, 4, 5, 6], dtype='int64') + + MultiIndex case + + >>> idx1 = cudf.MultiIndex.from_pandas( + ... pd.MultiIndex.from_arrays( + ... [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ... ) + ... ) + >>> idx1 + MultiIndex([(1, 'Red'), + (1, 'Blue'), + (2, 'Red'), + (2, 'Blue')], + ) + >>> idx2 = cudf.MultiIndex.from_pandas( + ... pd.MultiIndex.from_arrays( + ... [[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]] + ... ) + ... ) + >>> idx2 + MultiIndex([(3, 'Red'), + (3, 'Green'), + (2, 'Red'), + (2, 'Green')], + ) + >>> idx1.union(idx2) + MultiIndex([(1, 'Blue'), + (1, 'Red'), + (2, 'Blue'), + (2, 'Green'), + (2, 'Red'), + (3, 'Green'), + (3, 'Red')], + ) + >>> idx1.union(idx2, sort=False) + MultiIndex([(1, 'Red'), + (1, 'Blue'), + (2, 'Red'), + (2, 'Blue'), + (3, 'Red'), + (3, 'Green'), + (2, 'Green')], + ) + """ + if not isinstance(other, BaseIndex): + other = cudf.Index(other, name=self.name) + + if sort not in {None, False}: + raise ValueError( + f"The 'sort' keyword only takes the values of " + f"None or False; {sort} was passed." + ) + + if not len(other) or self.equals(other): + return self._get_reconciled_name_object(other) + elif not len(self): + return other._get_reconciled_name_object(self) + + result = self._union(other, sort=sort) + result.name = _get_result_name(self.name, other.name) + return result + + def intersection(self, other, sort=False): + """ + Form the intersection of two Index objects. + + This returns a new Index with elements common to the index and `other`. + + Parameters + ---------- + other : Index or array-like + sort : False or None, default False + Whether to sort the resulting index. 
+ + * False : do not sort the result. + * None : sort the result, except when `self` and `other` are equal + or when the values cannot be compared. + + Returns + ------- + intersection : Index + + Examples + -------- + >>> import cudf + >>> import pandas as pd + >>> idx1 = cudf.Index([1, 2, 3, 4]) + >>> idx2 = cudf.Index([3, 4, 5, 6]) + >>> idx1.intersection(idx2) + Int64Index([3, 4], dtype='int64') + + MultiIndex case + + >>> idx1 = cudf.MultiIndex.from_pandas( + ... pd.MultiIndex.from_arrays( + ... [[1, 1, 3, 4], ["Red", "Blue", "Red", "Blue"]] + ... ) + ... ) + >>> idx2 = cudf.MultiIndex.from_pandas( + ... pd.MultiIndex.from_arrays( + ... [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ... ) + ... ) + >>> idx1 + MultiIndex([(1, 'Red'), + (1, 'Blue'), + (3, 'Red'), + (4, 'Blue')], + ) + >>> idx2 + MultiIndex([(1, 'Red'), + (1, 'Blue'), + (2, 'Red'), + (2, 'Blue')], + ) + >>> idx1.intersection(idx2) + MultiIndex([(1, 'Red'), + (1, 'Blue')], + ) + >>> idx1.intersection(idx2, sort=False) + MultiIndex([(1, 'Red'), + (1, 'Blue')], + ) + """ + if not isinstance(other, BaseIndex): + other = cudf.Index(other, name=self.name) + + if sort not in {None, False}: + raise ValueError( + f"The 'sort' keyword only takes the values of " + f"None or False; {sort} was passed." 
+ ) + + if self.equals(other): + if self.has_duplicates: + return self.unique()._get_reconciled_name_object(other) + return self._get_reconciled_name_object(other) + + res_name = _get_result_name(self.name, other.name) + + if (self.is_boolean() and other.is_numeric()) or ( + self.is_numeric() and other.is_boolean() + ): + if isinstance(self, cudf.MultiIndex): + return self[:0].rename(res_name) + else: + return cudf.Index([], name=res_name) + + if self.has_duplicates: + lhs = self.unique() + else: + lhs = self + if other.has_duplicates: + rhs = other.unique() + else: + rhs = other + result = lhs._intersection(rhs, sort=sort) + result.name = res_name + return result + + def _get_reconciled_name_object(self, other): + """ + If the result of a set operation will be self, + return self, unless the name changes, in which + case make a shallow copy of self. + """ + name = _get_result_name(self.name, other.name) + if self.name != name: + return self.rename(name) + return self + def fillna(self, value, downcast=None): """ Fill null values with the specified value. @@ -544,6 +749,282 @@ def difference(self, other, sort=None): return difference + def is_numeric(self): + """ + Check if the Index only consists of numeric data. + + Returns + ------- + bool + Whether or not the Index only consists of numeric data. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. 
+ + Examples + -------- + >>> import cudf + >>> idx = cudf.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_numeric() + True + >>> idx = cudf.Index([1, 2, 3, 4.0]) + >>> idx.is_numeric() + True + >>> idx = cudf.Index([1, 2, 3, 4]) + >>> idx.is_numeric() + True + >>> idx = cudf.Index([1, 2, 3, 4.0, np.nan]) + >>> idx.is_numeric() + True + >>> idx = cudf.Index(["Apple", "cold"]) + >>> idx.is_numeric() + False + """ + raise NotImplementedError + + def is_boolean(self): + """ + Check if the Index only consists of booleans. + + Returns + ------- + bool + Whether or not the Index only consists of booleans. + + See Also + -------- + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index([True, False, True]) + >>> idx.is_boolean() + True + >>> idx = cudf.Index(["True", "False", "True"]) + >>> idx.is_boolean() + False + >>> idx = cudf.Index([1, 2, 3]) + >>> idx.is_boolean() + False + """ + raise NotImplementedError + + def is_integer(self): + """ + Check if the Index only consists of integers. + + Returns + ------- + bool + Whether or not the Index only consists of integers. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. 
+ + Examples + -------- + >>> import cudf + >>> idx = cudf.Index([1, 2, 3, 4]) + >>> idx.is_integer() + True + >>> idx = cudf.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_integer() + False + >>> idx = cudf.Index(["Apple", "Mango", "Watermelon"]) + >>> idx.is_integer() + False + """ + raise NotImplementedError + + def is_floating(self): + """ + Check if the Index is a floating type. + + The Index may consist of only floats, NaNs, or a mix of floats, + integers, or NaNs. + + Returns + ------- + bool + Whether or not the Index only consists + of floats, NaNs, or a mix of floats, integers, or NaNs. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_floating() + True + >>> idx = cudf.Index([1.0, 2.0, np.nan, 4.0]) + >>> idx.is_floating() + True + >>> idx = cudf.Index([1, 2, 3, 4, np.nan]) + >>> idx.is_floating() + True + >>> idx = cudf.Index([1, 2, 3, 4]) + >>> idx.is_floating() + False + """ + raise NotImplementedError + + def is_object(self): + """ + Check if the Index is of the object dtype. + + Returns + ------- + bool + Whether or not the Index is of the object dtype. + + See Also + -------- + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_categorical : Check if the Index holds categorical data. + is_interval : Check if the Index holds Interval objects. 
+ + Examples + -------- + >>> import cudf + >>> idx = cudf.Index(["Apple", "Mango", "Watermelon"]) + >>> idx.is_object() + True + >>> idx = cudf.Index(["Watermelon", "Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.is_object() + False + >>> idx = cudf.Index([1.0, 2.0, 3.0, 4.0]) + >>> idx.is_object() + False + """ + raise NotImplementedError + + def is_categorical(self): + """ + Check if the Index holds categorical data. + + Returns + ------- + bool + True if the Index is categorical. + + See Also + -------- + CategoricalIndex : Index for categorical data. + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_interval : Check if the Index holds Interval objects. + + Examples + -------- + >>> import cudf + >>> idx = cudf.Index(["Watermelon", "Orange", "Apple", + ... "Watermelon"]).astype("category") + >>> idx.is_categorical() + True + >>> idx = cudf.Index([1, 3, 5, 7]) + >>> idx.is_categorical() + False + >>> s = cudf.Series(["Peter", "Victor", "Elisabeth", "Mar"]) + >>> s + 0 Peter + 1 Victor + 2 Elisabeth + 3 Mar + dtype: object + >>> s.index.is_categorical() + False + """ + raise NotImplementedError + + def is_interval(self): + """ + Check if the Index holds Interval objects. + + Returns + ------- + bool + Whether or not the Index holds Interval objects. + + See Also + -------- + IntervalIndex : Index for Interval objects. + is_boolean : Check if the Index only consists of booleans. + is_integer : Check if the Index only consists of integers. + is_floating : Check if the Index is a floating type. + is_numeric : Check if the Index only consists of numeric data. + is_object : Check if the Index is of the object dtype. + is_categorical : Check if the Index holds categorical data. 
+ + Examples + -------- + >>> import cudf + >>> idx = cudf.from_pandas( + ... pd.Index([pd.Interval(left=0, right=5), + ... pd.Interval(left=5, right=10)]) + ... ) + >>> idx.is_interval() + True + >>> idx = cudf.Index([1, 3, 5, 7]) + >>> idx.is_interval() + False + """ + raise NotImplementedError + + def _union(self, other, sort=None): + # TODO: As a future optimization we should explore + # not doing `to_frame` + self_df = self.to_frame(index=False, name=0) + other_df = other.to_frame(index=False, name=0) + self_df["order"] = self_df.index + other_df["order"] = other_df.index + res = self_df.merge(other_df, on=[0], how="outer") + res = res.sort_values(by=res.columns[1:], ignore_index=True) + union_result = cudf.core.index._index_from_data({0: res._data[0]}) + + if sort is None and len(other): + return union_result.sort_values() + return union_result + + def _intersection(self, other, sort=None): + intersection_result = self.unique().join(other.unique(), how="inner") + + if sort is None and len(other): + return intersection_result.sort_values() + return intersection_result + def sort_values(self, return_indexer=False, ascending=True, key=None): """ Return a sorted copy of the index, and optionally return the indices @@ -637,7 +1118,9 @@ def unique(self): ------- Index without duplicates """ - return cudf.Index(self._values.unique(), name=self.name) + return cudf.core.index._index_from_data( + {self.name: self._values.unique()}, name=self.name + ) def join( self, other, how="left", level=None, return_indexers=False, sort=False @@ -970,3 +1453,10 @@ def from_pandas(cls, index, nan_as_null=None): @property def _constructor_expanddim(self): return cudf.MultiIndex + + +def _get_result_name(left_name, right_name): + if left_name == right_name: + return left_name + else: + return None diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 18216def17e..ce4f9ba39a0 100644 --- a/python/cudf/cudf/core/column/column.py +++ 
b/python/cudf/cudf/core/column/column.py @@ -900,7 +900,13 @@ def distinct_count( if method != "sort": msg = "non sort based distinct_count() not implemented yet" raise NotImplementedError(msg) - return cpp_distinct_count(self, ignore_nulls=dropna) + try: + return self._distinct_count[dropna] + except KeyError: + self._distinct_count[dropna] = cpp_distinct_count( + self, ignore_nulls=dropna + ) + return self._distinct_count[dropna] def can_cast_safely(self, to_dtype: Dtype) -> bool: raise NotImplementedError() @@ -1128,6 +1134,12 @@ def unique(self) -> ColumnBase: """ Get unique values in the data """ + # TODO: We could avoid performing `drop_duplicates` for + # columns with values that already are unique. + # Before we can do this optimization, the following + # issue must be resolved: + # https://github.com/rapidsai/cudf/issues/5286 + return ( self.as_frame() .drop_duplicates(keep="first", ignore_index=True) diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index 7adb01a03bf..f7e8f302f90 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -145,6 +145,8 @@ class RangeIndex(BaseIndex): RangeIndex(start=1, stop=10, step=1, name='a') """ + _range: range + def __init__( self, start, stop=None, step=1, dtype=None, copy=False, name=None ): @@ -163,6 +165,10 @@ def __init__( self._step = int(step) if step is not None else 1 self._index = None self._name = name + self._range = range(self._start, self._stop, self._step) + # _end is the actual last element of RangeIndex, + # whereas _stop is an upper bound. 
+ self._end = self._start + self._step * (len(self._range) - 1) def _copy_type_metadata( self, other: Frame, include_index: bool = True @@ -216,6 +222,27 @@ def _values(self): else: return column.column_empty(0, masked=False, dtype=self.dtype) + def is_numeric(self): + return True + + def is_boolean(self): + return False + + def is_integer(self): + return True + + def is_floating(self): + return False + + def is_object(self): + return False + + def is_categorical(self): + return False + + def is_interval(self): + return False + @property def _data(self): return cudf.core.column_accessor.ColumnAccessor( @@ -312,7 +339,7 @@ def equals(self, other): other._step, ): return True - return cudf.Int64Index._from_data(self._data).equals(other) + return Int64Index._from_data(self._data).equals(other) def serialize(self): header = {} @@ -476,7 +503,7 @@ def __rmul__(self, other): def _as_int64(self): # Convert self to an Int64Index. This method is used to perform ops # that are not defined directly on RangeIndex. 
- return cudf.Int64Index._from_data(self._data) + return Int64Index._from_data(self._data) def __getattr__(self, key): # For methods that are not defined for RangeIndex we attempt to operate @@ -521,6 +548,125 @@ def get_loc(self, key, method=None, tolerance=None): raise KeyError(key) return np.clip(round_method(idx), 0, idx_int_upper_bound, dtype=int) + def _union(self, other, sort=None): + if isinstance(other, RangeIndex): + # Variable suffixes are of the + # following notation: *_o -> other, *_s -> self, + # and *_r -> result + start_s, step_s = self.start, self.step + end_s = self._end + start_o, step_o = other.start, other.step + end_o = other._end + if self.step < 0: + start_s, step_s, end_s = end_s, -step_s, start_s + if other.step < 0: + start_o, step_o, end_o = end_o, -step_o, start_o + if len(self) == 1 and len(other) == 1: + step_s = step_o = abs(self.start - other.start) + elif len(self) == 1: + step_s = step_o + elif len(other) == 1: + step_o = step_s + + # Determine minimum start value of the result. + start_r = min(start_s, start_o) + # Determine maximum end value of the result. + end_r = max(end_s, end_o) + result = None + min_step = min(step_o, step_s) + + if ((start_s - start_o) % min_step) == 0: + # Checking to determine other is a subset of self with + # equal step size. + if ( + step_o == step_s + and (start_s - end_o) <= step_s + and (start_o - end_s) <= step_s + ): + result = type(self)(start_r, end_r + step_s, step_s) + # Checking if self is a subset of other with unequal + # step sizes. + elif ( + step_o % step_s == 0 + and (start_o + step_s >= start_s) + and (end_o - step_s <= end_s) + ): + result = type(self)(start_r, end_r + step_s, step_s) + # Checking if other is a subset of self with unequal + # step sizes. 
elif ( + step_s % step_o == 0 + and (start_s + step_o >= start_o) + and (end_s - step_o <= end_o) + ): + result = type(self)(start_r, end_r + step_o, step_o) + # Checking to determine when the steps are even but one of + # the inputs spans across near half or less than half + # the other input. This case needs manipulation to step + # size. + elif ( + step_o == step_s + and (step_s % 2 == 0) + and (abs(start_s - start_o) <= step_s / 2) + and (abs(end_s - end_o) <= step_s / 2) + ): + result = type(self)(start_r, end_r + step_s / 2, step_s / 2) + if result is not None: + if sort is None and not result.is_monotonic_increasing: + return result.sort_values() + else: + return result + + # If all the above optimizations don't cater to the inputs, + # we materialize RangeIndex's into `Int64Index` and + # then perform `union`. + return Int64Index(self._values)._union(other, sort=sort) + + def _intersection(self, other, sort=False): + if not isinstance(other, RangeIndex): + return super()._intersection(other, sort=sort) + + if not len(self) or not len(other): + return RangeIndex(0) + + first = self._range[::-1] if self.step < 0 else self._range + second = other._range[::-1] if other.step < 0 else other._range + + # check whether intervals intersect + # deals with in- and decreasing ranges + int_low = max(first.start, second.start) + int_high = min(first.stop, second.stop) + if int_high <= int_low: + return RangeIndex(0) + + # Method hint: linear Diophantine equation + # solve intersection problem + # performance hint: for identical step sizes, could use + # cheaper alternative + gcd, s, _ = _extended_gcd(first.step, second.step) + + # check whether element sets intersect + if (first.start - second.start) % gcd: + return RangeIndex(0) + + # calculate parameters for the RangeIndex describing the + # intersection disregarding the lower bounds + tmp_start = ( + first.start + (second.start - first.start) * first.step // gcd * s + ) + new_step = first.step * second.step // 
gcd + no_steps = -(-(int_low - tmp_start) // abs(new_step)) + new_start = tmp_start + abs(new_step) * no_steps + new_range = range(new_start, int_high, new_step) + new_index = RangeIndex(new_range) + + if (self.step < 0 and other.step < 0) is not (new_index.step < 0): + new_index = new_index[::-1] + if sort is None: + new_index = new_index.sort_values() + + return new_index + # Patch in all binops and unary ops, which bypass __getattr__ on the instance # and prevent the above overload from working. @@ -994,6 +1140,27 @@ def find_label_range(self, first, last): def get_slice_bound(self, label, side, kind=None): return self._values.get_slice_bound(label, side, kind) + def is_numeric(self): + return False + + def is_boolean(self): + return True + + def is_integer(self): + return False + + def is_floating(self): + return False + + def is_object(self): + return False + + def is_categorical(self): + return False + + def is_interval(self): + return False + class NumericIndex(GenericIndex): """Immutable, ordered and sliceable sequence of labels. 
@@ -1029,6 +1196,27 @@ def __init__(self, data=None, dtype=None, copy=False, name=None): super().__init__(data, **kwargs) + def is_numeric(self): + return True + + def is_boolean(self): + return False + + def is_integer(self): + return True + + def is_floating(self): + return False + + def is_object(self): + return False + + def is_categorical(self): + return False + + def is_interval(self): + return False + class Int8Index(NumericIndex): """ @@ -1254,6 +1442,12 @@ class Float32Index(NumericIndex): _dtype = np.float32 + def is_integer(self): + return False + + def is_floating(self): + return True + class Float64Index(NumericIndex): """ @@ -1279,6 +1473,12 @@ class Float64Index(NumericIndex): _dtype = np.float64 + def is_integer(self): + return False + + def is_floating(self): + return True + class DatetimeIndex(GenericIndex): """ @@ -1654,6 +1854,9 @@ def _get_dt_field(self, field): ) return as_index(out_column, name=self.name) + def is_boolean(self): + return False + class TimedeltaIndex(GenericIndex): """ @@ -1782,6 +1985,9 @@ def inferred_freq(self): """ raise NotImplementedError("inferred_freq is not yet supported") + def is_boolean(self): + return False + class CategoricalIndex(GenericIndex): """ @@ -1894,6 +2100,12 @@ def categories(self): """ return as_index(self._values.categories) + def is_boolean(self): + return False + + def is_categorical(self): + return True + def interval_range( start=None, end=None, periods=None, freq=None, name=None, closed="right", @@ -2025,7 +2237,7 @@ def interval_range( if len(right_col) == 0 or len(left_col) == 0: dtype = IntervalDtype("int64", closed) data = column.column_empty_like_same_mask(left_col, dtype) - return cudf.IntervalIndex(data, closed=closed) + return IntervalIndex(data, closed=closed) interval_col = column.build_interval_column( left_col, right_col, closed=closed @@ -2122,6 +2334,12 @@ def from_breaks(breaks, closed="right", name=None, copy=False, dtype=None): return IntervalIndex(interval_col, name=name) + def 
is_interval(self): + return True + + def is_boolean(self): + return False + class StringIndex(GenericIndex): """String defined indices into another Column @@ -2182,6 +2400,12 @@ def _clean_nulls_from_index(self): else: return self + def is_boolean(self): + return False + + def is_object(self): + return True + def as_index(arbitrary, **kwargs) -> BaseIndex: """Create an Index from an arbitrary object @@ -2370,3 +2594,21 @@ def _concat_range_index(indexes: List[RangeIndex]) -> BaseIndex: stop = non_empty_indexes[-1].stop if next_ is None else next_ return RangeIndex(start, stop, step) + + +def _extended_gcd(a: int, b: int) -> Tuple[int, int, int]: + """ + Extended Euclidean algorithms to solve Bezout's identity: + a*x + b*y = gcd(x, y) + Finds one particular solution for x, y: s, t + Returns: gcd, s, t + """ + s, old_s = 0, 1 + t, old_t = 1, 0 + r, old_r = b, a + while r: + quotient = old_r // r + old_r, r = r, old_r - quotient * r + old_s, s = s, old_s - quotient * s + old_t, t = t, old_t - quotient * t + return old_r, old_s, old_t diff --git a/python/cudf/cudf/core/multiindex.py b/python/cudf/cudf/core/multiindex.py index e1274dc7758..f858a589614 100644 --- a/python/cudf/cudf/core/multiindex.py +++ b/python/cudf/cudf/core/multiindex.py @@ -941,6 +941,27 @@ def get_level_values(self, level): level_values = as_index(self._data[level], name=self.names[level_idx]) return level_values + def is_numeric(self): + return False + + def is_boolean(self): + return False + + def is_integer(self): + return False + + def is_floating(self): + return False + + def is_object(self): + return False + + def is_categorical(self): + return False + + def is_interval(self): + return False + @classmethod def _concat(cls, objs): @@ -1649,3 +1670,74 @@ def get_loc(self, key, method=None, tolerance=None): mask = cupy.full(self._data.nrows, False) mask[true_inds] = True return mask + + def _get_reconciled_name_object(self, other) -> MultiIndex: + """ + If the result of a set operation will be 
self, + return self, unless the names change, in which + case make a shallow copy of self. + """ + names = self._maybe_match_names(other) + if self.names != names: + return self.rename(names) + return self + + def _maybe_match_names(self, other): + """ + Try to find common names to attach to the result of an operation + between a and b. Return a consensus list of names if they match + at least partly or list of None if they have completely + different names. + """ + if len(self.names) != len(other.names): + return [None] * len(self.names) + return [ + self_name if self_name == other_name else None + for self_name, other_name in zip(self.names, other.names) + ] + + def _union(self, other, sort=None): + # TODO: When to_frame is refactored to return a + # deep copy in future, we should push most of the common + # logic between MultiIndex._union & BaseIndex._union into + # GenericIndex._union. + other_df = other.copy(deep=True).to_frame(index=False) + self_df = self.copy(deep=True).to_frame(index=False) + col_names = list(range(0, self.nlevels)) + self_df.columns = col_names + other_df.columns = col_names + self_df["order"] = self_df.index + other_df["order"] = other_df.index + + result_df = self_df.merge(other_df, on=col_names, how="outer") + result_df = result_df.sort_values( + by=result_df.columns[self.nlevels :], ignore_index=True + ) + + midx = MultiIndex.from_frame(result_df.iloc[:, : self.nlevels]) + midx.names = self.names if self.names == other.names else None + if sort is None and len(other): + return midx.sort_values() + return midx + + def _intersection(self, other, sort=None): + if self.names != other.names: + deep = True + col_names = list(range(0, self.nlevels)) + res_name = (None,) * self.nlevels + else: + deep = False + col_names = None + res_name = self.names + + other_df = other.copy(deep=deep).to_frame(index=False) + self_df = self.copy(deep=deep).to_frame(index=False) + if col_names is not None: + other_df.columns = col_names + self_df.columns = 
col_names + + result_df = cudf.merge(self_df, other_df, how="inner") + midx = self.__class__.from_frame(result_df, names=res_name) + if sort is None and len(other): + return midx.sort_values() + return midx diff --git a/python/cudf/cudf/core/reshape.py b/python/cudf/cudf/core/reshape.py index a2155deb51e..ea9a1b9549f 100644 --- a/python/cudf/cudf/core/reshape.py +++ b/python/cudf/cudf/core/reshape.py @@ -10,7 +10,7 @@ _AXIS_MAP = {0: 0, 1: 1, "index": 0, "columns": 1} -def _align_objs(objs, how="outer"): +def _align_objs(objs, how="outer", sort=None): """Align a set of Series or Dataframe objects. Parameters @@ -18,16 +18,18 @@ def _align_objs(objs, how="outer"): objs : list of DataFrame, Series, or Index how : How to handle indexes on other axis (or axes), similar to join in concat + sort : Whether to sort the resulting Index Returns ------- - A bool for if indexes have matched and a set of - reindexed and aligned objects ready for concatenation + A list of reindexed and aligned objects + ready for concatenation """ # Check if multiindex then check if indexes match. GenericIndex # returns ndarray tuple of bools requiring additional filter. # Then check for duplicate index value. 
i_objs = iter(objs) first = next(i_objs) + not_matching_index = any( not first.index.equals(rest.index) for rest in i_objs ) @@ -38,36 +40,50 @@ def _align_objs(objs, how="outer"): index = objs[0].index name = index.name - if how == "inner" or isinstance(index, cudf.MultiIndex): - for obj in objs[1:]: - index = ( - cudf.DataFrame(index=obj.index) - .join(cudf.DataFrame(index=index), how=how) - .index - ) - index.name = name - return [obj.reindex(index) for obj in objs], False - else: - all_index_objs = [obj.index for obj in objs] - appended_index = all_index_objs[0].append(all_index_objs[1:]) - df = cudf.DataFrame( - { - "idxs": appended_index, - "order": cudf.core.column.arange( - start=0, stop=len(appended_index) - ), - } - ) - df = df.drop_duplicates(subset=["idxs"]).sort_values( - by=["order"], ascending=True - ) - final_index = df["idxs"] - final_index.name = name + final_index = _get_combined_index( + [obj.index for obj in objs], intersect=how == "inner", sort=sort + ) - return [obj.reindex(final_index) for obj in objs], False + final_index.name = name + return [ + obj.reindex(final_index) + if not final_index.equals(obj.index) + else obj + for obj in objs + ] else: - return objs, True + if sort: + if not first.index.is_monotonic_increasing: + final_index = first.index.sort_values() + return [obj.reindex(final_index) for obj in objs] + return objs + + +def _get_combined_index(indexes, intersect: bool = False, sort=None): + if len(indexes) == 0: + index = cudf.Index([]) + elif len(indexes) == 1: + index = indexes[0] + elif intersect: + sort = True + index = indexes[0] + for other in indexes[1:]: + # Don't sort for every intersection, + # let the sorting happen in the end. 
+ index = index.intersection(other, sort=False) + else: + index = indexes[0] + if sort is None: + sort = False if isinstance(index, cudf.StringIndex) else True + for other in indexes[1:]: + index = index.union(other, sort=False) + + if sort: + if not index.is_monotonic_increasing: + index = index.sort_values() + + return index def _normalize_series_and_dataframe(objs, axis): @@ -202,7 +218,6 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): 0 a 1 bird polly 1 b 2 monkey george """ - # TODO: Do we really need to have different error messages for an empty # list and a list of None? if not objs: @@ -286,9 +301,12 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): if len(objs) == 0: return df - objs, match_index = _align_objs(objs, how=join) + # Don't need to align indices of all `objs` since we + # would anyway return an empty dataframe below + if not empty_inner: + objs = _align_objs(objs, how=join, sort=sort) + df.index = objs[0].index - df.index = objs[0].index for o in objs: for name, col in o._data.items(): if name in df._data: @@ -297,7 +315,15 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): f"doesn't support having multiple columns with " f"same names yet." ) - df[name] = col + if empty_inner: + # if join is inner and it contains an empty df + # we return an empty df, hence creating an empty + # column with dtype metadata retained. + df[name] = cudf.core.column.column_empty_like( + col, newsize=0 + ) + else: + df[name] = col result_columns = objs[0].columns.append( [obj.columns for obj in objs[1:]] @@ -314,20 +340,7 @@ def concat(objs, axis=0, join="outer", ignore_index=False, sort=None): # we return an empty df return df.head(0) - # This check uses `sort is not False` rather than just `sort=True` - # to differentiate between a user-provided `False` value and the - # default `None`. 
This is necessary for pandas compatibility, even - # though `True` and `False` are the only valid options from the user. - if not match_index and sort is not False: - return df.sort_index() - - if sort or join == "inner": - # when join='outer' and sort=False string indexes - # are returned unsorted. Everything else seems - # to be returned sorted when axis = 1 - return df.sort_index() - else: - return df + return df # If we get here, we are always concatenating along axis 0 (the rows). typ = list(typs)[0] @@ -499,7 +512,7 @@ def melt( dtypes = [frame[col].dtype for col in id_vars + value_vars] if any(cudf.api.types.is_categorical_dtype(t) for t in dtypes): raise NotImplementedError( - "Categorical columns are not yet " "supported for function" + "Categorical columns are not yet supported for function" ) # Check dtype homogeneity in value_var diff --git a/python/cudf/cudf/tests/test_concat.py b/python/cudf/cudf/tests/test_concat.py index 3c1ff4c968e..bb96f3c4290 100644 --- a/python/cudf/cudf/tests/test_concat.py +++ b/python/cudf/cudf/tests/test_concat.py @@ -229,7 +229,11 @@ def test_concat_multiindex_dataframe(): pd.concat([pdg1, pdg2]), check_index_type=True, ) - assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1)) + assert_eq( + gd.concat([gdg1, gdg2], axis=1), + pd.concat([pdg1, pdg2], axis=1), + check_index_type=True, + ) def test_concat_multiindex_series(): @@ -269,7 +273,11 @@ def test_concat_multiindex_dataframe_and_series(): pdg2.name = "a" gdg1 = gd.from_pandas(pdg1) gdg2 = gd.from_pandas(pdg2) - assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, pdg2], axis=1)) + assert_eq( + gd.concat([gdg1, gdg2], axis=1), + pd.concat([pdg1, pdg2], axis=1), + check_index_type=True, + ) def test_concat_multiindex_series_and_dataframe(): @@ -288,7 +296,11 @@ def test_concat_multiindex_series_and_dataframe(): pdg1.name = "a" gdg1 = gd.from_pandas(pdg1) gdg2 = gd.from_pandas(pdg2) - assert_eq(gd.concat([gdg1, gdg2], axis=1), pd.concat([pdg1, 
pdg2], axis=1)) + assert_eq( + gd.concat([gdg1, gdg2], axis=1), + pd.concat([pdg1, pdg2], axis=1), + check_index_type=True, + ) @pytest.mark.parametrize("myindex", ["a", "b"]) @@ -328,7 +340,9 @@ def test_pandas_concat_compatibility_axis1(): expect = pd.concat([pd1, pd2, pd3, pd4, pd5], axis=1) got = gd.concat([d1, d2, d3, d4, d5], axis=1) - assert_eq(got, expect) + assert_eq( + got, expect, check_index_type=True, + ) @pytest.mark.parametrize("index", [[0, 1, 2], [2, 1, 0], [5, 9, 10]]) @@ -350,7 +364,7 @@ def test_pandas_concat_compatibility_axis1_overlap(index, names, data): ps2 = s2.to_pandas() got = gd.concat([s1, s2], axis=1) expect = pd.concat([ps1, ps2], axis=1) - assert_eq(got, expect) + assert_eq(got, expect, check_index_type=True) def test_pandas_concat_compatibility_axis1_eq_index(): @@ -640,10 +654,12 @@ def test_concat_dataframe_with_multiIndex(df1, df2): pdf1 = gdf1.to_pandas() pdf2 = gdf2.to_pandas() - expected = gd.concat([gdf1, gdf2], axis=1) - actual = pd.concat([pdf1, pdf2], axis=1) + actual = gd.concat([gdf1, gdf2], axis=1) + expected = pd.concat([pdf1, pdf2], axis=1) - assert_eq(expected, actual) + assert_eq( + expected, actual, check_index_type=True, + ) @pytest.mark.parametrize( @@ -761,18 +777,23 @@ def test_concat_join_axis_1_dup_error(objs): def test_concat_join_axis_1(objs, ignore_index, sort, join, axis): # no duplicate columns gpu_objs = [gd.from_pandas(o) for o in objs] - + expected = pd.concat( + objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis + ) + actual = gd.concat( + gpu_objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis, + ) + # TODO: Remove special handling below + # after following bug from pandas is fixed: + # https://github.com/pandas-dev/pandas/issues/43584 assert_eq( - pd.concat( - objs, sort=sort, join=join, ignore_index=ignore_index, axis=axis - ), - gd.concat( - gpu_objs, - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), + expected, + actual, + check_index_type=False 
+ if sort + and isinstance(expected.index, pd.Int64Index) + and isinstance(actual.index, gd.RangeIndex) + else True, ) @@ -833,14 +854,23 @@ def test_concat_join_one_df(ignore_index, sort, join, axis): ) gdf1 = gd.from_pandas(pdf1) - + expected = pd.concat( + [pdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis + ) + actual = gd.concat( + [gdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis + ) + # TODO: Remove special handling below + # after following bug from pandas is fixed: + # https://github.com/pandas-dev/pandas/issues/43584 assert_eq( - pd.concat( - [pdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis - ), - gd.concat( - [gdf1], sort=sort, join=join, ignore_index=ignore_index, axis=axis - ), + expected, + actual, + check_index_type=False + if sort + and isinstance(expected.index, pd.Int64Index) + and isinstance(actual.index, gd.RangeIndex) + else True, ) @@ -870,21 +900,34 @@ def test_concat_join_no_overlapping_columns( ): gdf1 = gd.from_pandas(pdf1) gdf2 = gd.from_pandas(pdf2) + + expected = pd.concat( + [pdf1, pdf2], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + actual = gd.concat( + [gdf1, gdf2], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + + # TODO: Remove special handling below + # after following bug from pandas is fixed: + # https://github.com/pandas-dev/pandas/issues/43584 assert_eq( - pd.concat( - [pdf1, pdf2], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), - gd.concat( - [gdf1, gdf2], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), + expected, + actual, + check_index_type=False + if sort + and axis == 1 + and isinstance(expected.index, pd.Int64Index) + and isinstance(actual.index, gd.RangeIndex) + else True, ) @@ -1013,22 +1056,24 @@ def test_concat_join_no_overlapping_columns_empty_df_basic( gdf6 = gd.from_pandas(pdf6) gdf_empty = gd.from_pandas(pdf_empty) - assert_eq( - pd.concat( - [pdf6, 
pdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ).reset_index(drop=True), - gd.concat( - [gdf6, gdf_empty], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), + expected = pd.concat( + [pdf6, pdf_empty], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + actual = gd.concat( + [gdf6, gdf_empty], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, ) + # TODO: change `check_index_type` to `True` + # after following bug from pandas is fixed: + # https://github.com/pandas-dev/pandas/issues/43584 + assert_eq(expected, actual, check_index_type=False) @pytest.mark.parametrize("ignore_index", [True, False]) @@ -1046,21 +1091,32 @@ def test_concat_join_series(ignore_index, sort, join, axis): ps3 = s3.to_pandas() ps4 = s4.to_pandas() + expected = pd.concat( + [ps1, ps2, ps3, ps4], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + actual = gd.concat( + [s1, s2, s3, s4], + sort=sort, + join=join, + ignore_index=ignore_index, + axis=axis, + ) + + # TODO: Remove special handling below + # after following bug from pandas is fixed: + # https://github.com/pandas-dev/pandas/issues/43584 assert_eq( - gd.concat( - [s1, s2, s3, s4], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), - pd.concat( - [ps1, ps2, ps3, ps4], - sort=sort, - join=join, - ignore_index=ignore_index, - axis=axis, - ), + expected, + actual, + check_index_type=False + if sort + and isinstance(expected.index, pd.Int64Index) + and isinstance(actual.index, gd.RangeIndex) + else True, ) diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 4ae86dc1cfc..334277add91 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ -2378,3 +2378,101 @@ def test_range_index_concat(objs): for obj in objs[1:]: expected = expected.append(obj) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "idx1, idx2", + 
[ + (pd.RangeIndex(0, 10), pd.RangeIndex(3, 7)), + (pd.RangeIndex(0, 10), pd.RangeIndex(10, 20)), + (pd.RangeIndex(0, 10, 2), pd.RangeIndex(1, 5, 3)), + (pd.RangeIndex(1, 5, 3), pd.RangeIndex(0, 10, 2)), + (pd.RangeIndex(1, 10, 3), pd.RangeIndex(1, 5, 2)), + (pd.RangeIndex(1, 5, 2), pd.RangeIndex(1, 10, 3)), + (pd.RangeIndex(1, 100, 3), pd.RangeIndex(1, 50, 3)), + (pd.RangeIndex(1, 100, 3), pd.RangeIndex(1, 50, 6)), + (pd.RangeIndex(1, 100, 6), pd.RangeIndex(1, 50, 3)), + (pd.RangeIndex(0, 10, name="a"), pd.RangeIndex(90, 100, name="b")), + (pd.Index([0, 1, 2, 30], name="a"), pd.Index([90, 100])), + (pd.Index([0, 1, 2, 30], name="a"), [90, 100]), + (pd.Index([0, 1, 2, 30]), pd.Index([0, 10, 1.0, 11])), + (pd.Index(["a", "b", "c", "d", "c"]), pd.Index(["a", "c", "z"])), + ], +) +@pytest.mark.parametrize("sort", [None, False]) +def test_union_index(idx1, idx2, sort): + expected = idx1.union(idx2, sort=sort) + + idx1 = cudf.from_pandas(idx1) if isinstance(idx1, pd.Index) else idx1 + idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.Index) else idx2 + + actual = idx1.union(idx2, sort=sort) + + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "idx1, idx2", + [ + (pd.RangeIndex(0, 10), pd.RangeIndex(3, 7)), + (pd.RangeIndex(0, 10), pd.RangeIndex(-10, 20)), + (pd.RangeIndex(0, 10, name="a"), pd.RangeIndex(90, 100, name="b")), + (pd.Index([0, 1, 2, 30], name="a"), pd.Index([30, 0, 90, 100])), + (pd.Index([0, 1, 2, 30], name="a"), [90, 100]), + (pd.Index([0, 1, 2, 30]), pd.Index([0, 10, 1.0, 11])), + (pd.Index(["a", "b", "c", "d", "c"]), pd.Index(["a", "c", "z"])), + ( + pd.Index(["a", "b", "c", "d", "c"]), + pd.Index(["a", "b", "c", "d", "c"]), + ), + (pd.Index([True, False, True, True]), pd.Index([10, 11, 12, 0, 1, 2])), + (pd.Index([True, False, True, True]), pd.Index([True, True])), + ], +) +@pytest.mark.parametrize("sort", [None, False]) +def test_intersection_index(idx1, idx2, sort): + + expected = idx1.intersection(idx2, sort=sort) + + idx1 = 
cudf.from_pandas(idx1) if isinstance(idx1, pd.Index) else idx1 + idx2 = cudf.from_pandas(idx2) if isinstance(idx2, pd.Index) else idx2 + + actual = idx1.intersection(idx2, sort=sort) + + assert_eq(expected, actual, exact=False) + + +@pytest.mark.parametrize( + "data", + [ + [1, 2, 3], + ["a", "v", "d"], + [234.243, 2432.3, None], + [True, False, True], + pd.Series(["a", " ", "v"], dtype="category"), + pd.IntervalIndex.from_breaks([0, 1, 2, 3]), + ], +) +@pytest.mark.parametrize( + "func", + [ + "is_numeric", + "is_boolean", + "is_integer", + "is_floating", + "is_object", + "is_categorical", + "is_interval", + ], +) +def test_index_type_methods(data, func): + pidx = pd.Index(data) + gidx = cudf.from_pandas(pidx) + + expected = getattr(pidx, func)() + actual = getattr(gidx, func)() + + if gidx.dtype == np.dtype("bool") and func == "is_object": + assert_eq(False, actual) + else: + assert_eq(expected, actual) diff --git a/python/cudf/cudf/tests/test_multiindex.py b/python/cudf/cudf/tests/test_multiindex.py index e2b1d72c63e..a6d0a10ce5d 100644 --- a/python/cudf/cudf/tests/test_multiindex.py +++ b/python/cudf/cudf/tests/test_multiindex.py @@ -1574,6 +1574,101 @@ def test_difference(): assert_eq(expected, actual) +@pytest.mark.parametrize( + "idx1, idx2", + [ + ( + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ), + pd.MultiIndex.from_arrays( + [[3, 3, 2, 2], ["Red", "Green", "Red", "Green"]] + ), + ), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], ["Red", "Blue", "Red", "Blue"]], + names=["a", "b"], + ), + pd.MultiIndex.from_arrays( + [[3, 3, 2, 4], ["Red", "Green", "Red", "Green"]], + names=["x", "y"], + ), + ), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[3, 3, 2, 4], [0.2, 0.4, 1.4, 10], [3, 3, 2, 4]] + ), + ), + ], +) +@pytest.mark.parametrize("sort", [None, False]) +def test_union_mulitIndex(idx1, idx2, sort): + expected = 
idx1.union(idx2, sort=sort) + + idx1 = cudf.from_pandas(idx1) + idx2 = cudf.from_pandas(idx2) + + actual = idx1.union(idx2, sort=sort) + assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "idx1, idx2", + [ + ( + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ), + pd.MultiIndex.from_arrays( + [[1, 3, 2, 2], ["Red", "Green", "Red", "Green"]] + ), + ), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], ["Red", "Blue", "Red", "Blue"]], + names=["a", "b"], + ), + pd.MultiIndex.from_arrays( + [[3, 3, 2, 4], ["Red", "Green", "Red", "Green"]], + names=["x", "y"], + ), + ), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[3, 3, 2, 4], [0.2, 0.4, 1.4, 10], [3, 3, 2, 4]] + ), + ), + ( + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + ), + ), + ], +) +@pytest.mark.parametrize("sort", [None, False]) +def test_intersection_mulitIndex(idx1, idx2, sort): + expected = idx1.intersection(idx2, sort=sort) + + idx1 = cudf.from_pandas(idx1) + idx2 = cudf.from_pandas(idx2) + + actual = idx1.intersection(idx2, sort=sort) + assert_eq(expected, actual, exact=False) + + @pytest.mark.parametrize( "names", [ @@ -1601,3 +1696,42 @@ def test_pickle_roundtrip_multiIndex(names): local_file.seek(0) actual_df = pickle.load(local_file) assert_eq(expected_df, actual_df) + + +@pytest.mark.parametrize( + "pidx", + [ + pd.MultiIndex.from_arrays( + [[1, 1, 2, 2], ["Red", "Blue", "Red", "Blue"]] + ), + pd.MultiIndex.from_arrays( + [[1, 2, 3, 4], [5, 6, 7, 10], [11, 12, 12, 13]], + names=["a", "b", "c"], + ), + pd.MultiIndex.from_arrays( + [[1.0, 2, 3, 4], [5, 6, 7.8, 10], [11, 12, 12, 13]], + ), + ], +) +@pytest.mark.parametrize( + "func", + [ + "is_numeric", + "is_boolean", + "is_integer", + "is_floating", + "is_object", + 
"is_categorical", + "is_interval", + ], +) +def test_multiIndex_type_methods(pidx, func): + gidx = cudf.from_pandas(pidx) + + expected = getattr(pidx, func)() + actual = getattr(gidx, func)() + + if func == "is_object": + assert_eq(False, actual) + else: + assert_eq(expected, actual)