diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index 48e6293c3f4..8aeaf08273f 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -172,11 +172,31 @@ def equals(self, other: ColumnBase, check_dtypes: bool = False) -> bool: def _null_equals(self, other: ColumnBase) -> ColumnBase: return self.binary_operator("NULL_EQUALS", other) - def all(self) -> bool: - return bool(libcudf.reduce.reduce("all", self, dtype=np.bool_)) + def all(self, skipna: bool = True) -> bool: + # If all entries are null the result is True, including when the column + # is empty. + result_col = self.nans_to_nulls() if skipna else self - def any(self) -> bool: - return bool(libcudf.reduce.reduce("any", self, dtype=np.bool_)) + if result_col.null_count == result_col.size: + return True + + if isinstance(result_col, ColumnBase): + return libcudf.reduce.reduce("all", result_col, dtype=np.bool_) + else: + return result_col + + def any(self, skipna: bool = True) -> bool: + # Early exit for fast cases. 
+ result_col = self.nans_to_nulls() if skipna else self + if not skipna and result_col.has_nulls: + return True + elif skipna and result_col.null_count == result_col.size: + return False + + if isinstance(result_col, ColumnBase): + return libcudf.reduce.reduce("any", result_col, dtype=np.bool_) + else: + return result_col def __sizeof__(self) -> int: n = 0 @@ -911,9 +931,9 @@ def astype(self, dtype: Dtype, **kwargs) -> ColumnBase: return self.as_interval_column(dtype, **kwargs) elif is_decimal_dtype(dtype): return self.as_decimal_column(dtype, **kwargs) - elif np.issubdtype(dtype, np.datetime64): + elif np.issubdtype(cast(Any, dtype), np.datetime64): return self.as_datetime_column(dtype, **kwargs) - elif np.issubdtype(dtype, np.timedelta64): + elif np.issubdtype(cast(Any, dtype), np.timedelta64): return self.as_timedelta_column(dtype, **kwargs) else: return self.as_numerical_column(dtype, **kwargs) diff --git a/python/cudf/cudf/core/dataframe.py b/python/cudf/cudf/core/dataframe.py index bc068413efb..8cdc6eebaee 100644 --- a/python/cudf/cudf/core/dataframe.py +++ b/python/cudf/cudf/core/dataframe.py @@ -6263,7 +6263,8 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): Single 5 dtype: int64 """ - if axis not in (0, "index", None): + axis = self._get_axis_from_axis_arg(axis) + if axis != 0: raise NotImplementedError("Only axis=0 is currently supported.") return self._apply_support_method( @@ -6274,268 +6275,37 @@ def count(self, axis=0, level=None, numeric_only=False, **kwargs): **kwargs, ) - def min( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, - ): - """ - Return the minimum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. 
- level: int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only: bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. + _SUPPORT_AXIS_LOOKUP = { + 0: 0, + 1: 1, + None: 0, + "index": 0, + "columns": 1, + } - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.min() - a 1 - b 7 - dtype: int64 - """ - return self._apply_support_method( - "min", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - def max( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, + def _reduce( + self, op, axis=None, level=None, numeric_only=None, **kwargs, ): - """ - Return the maximum of the values in the DataFrame. - - Parameters - ---------- - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - level: int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only: bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.max() - a 4 - b 10 - dtype: int64 - """ - return self._apply_support_method( - "max", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - - def sum( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return sum of the values in the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.sum() - a 10 - b 34 - dtype: int64 - """ - return self._apply_support_method( - "sum", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - - def product( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return product of the values in the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. 
- If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.product() - a 24 - b 5040 - dtype: int64 - """ - return self._apply_support_method( - "prod", - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) - - def prod( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return product of the values in the DataFrame. - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values when computing the result. - dtype: data type - Data type to cast the result to. - min_count: int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. + if level is not None: + raise NotImplementedError("level parameter is not implemented yet") - Returns - ------- - scalar + if numeric_only not in (None, True): + raise NotImplementedError( + "numeric_only parameter is not implemented yet" + ) + axis = self._get_axis_from_axis_arg(axis) - Notes - ----- - Parameters currently not supported are `level`, `numeric_only`. 
+ if axis == 0: + result = [ + getattr(self._data[col], op)(**kwargs) + for col in self._data.names + ] - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.prod() - a 24 - b 5040 - dtype: int64 - """ - return self.product( - axis=axis, - skipna=skipna, - dtype=dtype, - level=level, - numeric_only=numeric_only, - min_count=min_count, - **kwargs, - ) + return Series._from_data( + {None: result}, as_index(self._data.names) + ) + elif axis == 1: + return self._apply_support_method_axis_1(op, **kwargs) def cummin(self, axis=None, skipna=True, *args, **kwargs): """ @@ -6686,50 +6456,6 @@ def cumprod(self, axis=None, skipna=True, *args, **kwargs): "cumprod", axis=axis, skipna=skipna, *args, **kwargs ) - def mean( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return the mean of the values for the requested axis. - - Parameters - ---------- - axis : {0 or 'index', 1 or 'columns'} - Axis for the function to be applied on. - skipna : bool, default True - Exclude NA/null values when computing the result. - level : int or level name, default None - If the axis is a MultiIndex (hierarchical), count along a - particular level, collapsing into a Series. - numeric_only : bool, default None - Include only float, int, boolean columns. If None, will attempt to - use everything, then use only numeric data. Not implemented for - Series. - **kwargs - Additional keyword arguments to be passed to the function. 
- - Returns - ------- - mean : Series or DataFrame (if level specified) - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.mean() - a 2.5 - b 8.5 - dtype: float64 - """ - return self._apply_support_method( - "mean", - axis=axis, - skipna=skipna, - level=level, - numeric_only=numeric_only, - **kwargs, - ) - def mode(self, axis=0, numeric_only=False, dropna=True): """ Get the mode(s) of each element along the selected axis. @@ -6830,117 +6556,6 @@ def mode(self, axis=0, numeric_only=False, dropna=True): return df - def std( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return sample standard deviation of the DataFrame. - - Normalized by N-1 by default. This can be changed using - the `ddof` argument - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. - skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof: int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is N - ddof, where N represents the number of elements. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.std() - a 1.290994 - b 1.290994 - dtype: float64 - """ - - return self._apply_support_method( - "std", - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - - def var( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return unbiased variance of the DataFrame. - - Normalized by N-1 by default. This can be changed using the - ddof argument - - Parameters - ---------- - - axis: {index (0), columns(1)} - Axis for the function to be applied on. 
- skipna: bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - ddof: int, default 1 - Delta Degrees of Freedom. The divisor used in calculations is - N - ddof, where N represents the number of elements. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) - >>> df.var() - a 1.666667 - b 1.666667 - dtype: float64 - """ - return self._apply_support_method( - "var", - axis=axis, - skipna=skipna, - level=level, - ddof=ddof, - numeric_only=numeric_only, - **kwargs, - ) - def kurtosis( self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs ): @@ -7041,213 +6656,126 @@ def skew( ) def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether all elements are True in DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be True, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - Series - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. 
- - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) - >>> df.all() - a True - b False - dtype: bool - """ - if bool_only: - return self.select_dtypes(include="bool")._apply_support_method( - "all", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, - ) - return self._apply_support_method( - "all", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, - ) + obj = self.select_dtypes(include="bool") if bool_only else self + return super(DataFrame, obj).all(axis, skipna, level, **kwargs) def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether any elements is True in DataFrame. - - Parameters - ---------- - - skipna: bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be False, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. + obj = self.select_dtypes(include="bool") if bool_only else self + return super(DataFrame, obj).any(axis, skipna, level, **kwargs) - Returns - ------- - Series + def _apply_support_method_axis_0(self, method, *args, **kwargs): + result = [ + getattr(self[col], method)(*args, **kwargs) + for col in self._data.names + ] - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. 
+ if isinstance(result[0], Series): + support_result = result + result = DataFrame(index=support_result[0].index) + for idx, col in enumerate(self._data.names): + result[col] = support_result[idx] + else: + result = Series(result) + result = result.set_index(self._data.names) + return result - Examples - -------- - >>> import cudf - >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) - >>> df.any() - a True - b True - dtype: bool - """ - if bool_only: - return self.select_dtypes(include="bool")._apply_support_method( - "any", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, + def _apply_support_method_axis_1(self, method, *args, **kwargs): + # for dask metadata compatibility + skipna = kwargs.pop("skipna", None) + if method not in _cupy_nan_methods_map and skipna not in ( + None, + True, + 1, + ): + raise NotImplementedError( + f"Row-wise operation to calculate '{method}'" + f" currently do not support `skipna=False`." ) - return self._apply_support_method( - "any", - axis=axis, - bool_only=bool_only, - skipna=skipna, - level=level, - **kwargs, - ) - - def _apply_support_method(self, method, axis=0, *args, **kwargs): - assert axis in (None, 0, 1) - if axis in (None, 0): - result = [ - getattr(self[col], method)(*args, **kwargs) - for col in self._data.names - ] + level = kwargs.pop("level", None) + if level not in (None,): + raise NotImplementedError( + "Row-wise operations currently do not support `level`." + ) - if isinstance(result[0], Series): - support_result = result - result = DataFrame(index=support_result[0].index) - for idx, col in enumerate(self._data.names): - result[col] = support_result[idx] - else: - result = Series(result) - result = result.set_index(self._data.names) - return result + numeric_only = kwargs.pop("numeric_only", None) + if numeric_only not in (None, True): + raise NotImplementedError( + "Row-wise operations currently do not " + "support `numeric_only=False`." 
+ ) - elif axis == 1: - # for dask metadata compatibility - skipna = kwargs.pop("skipna", None) - if method not in _cupy_nan_methods_map and skipna not in ( - None, - True, - 1, - ): - raise NotImplementedError( - f"Row-wise operation to calculate '{method}'" - f" currently do not support `skipna=False`." - ) + min_count = kwargs.pop("min_count", None) + if min_count not in (None, 0): + raise NotImplementedError( + "Row-wise operations currently do not " "support `min_count`." + ) - level = kwargs.pop("level", None) - if level not in (None,): - raise NotImplementedError( - "Row-wise operations currently do not support `level`." - ) + bool_only = kwargs.pop("bool_only", None) + if bool_only not in (None, True): + raise NotImplementedError( + "Row-wise operations currently do not " "support `bool_only`." + ) - numeric_only = kwargs.pop("numeric_only", None) - if numeric_only not in (None, True): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `numeric_only=False`." + prepared, mask, common_dtype = self._prepare_for_rowwise_op( + method, skipna + ) + for col in prepared._data.names: + if prepared._data[col].nullable: + prepared._data[col] = ( + prepared._data[col] + .astype( + cudf.utils.dtypes.get_min_float_dtype( + prepared._data[col] + ) + if not is_datetime_dtype(common_dtype) + else np.dtype("float64") + ) + .fillna(np.nan) ) - - min_count = kwargs.pop("min_count", None) - if min_count not in (None, 0): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `min_count`." 
+ arr = cupy.asarray(prepared.as_gpu_matrix()) + + if skipna is not False and method in _cupy_nan_methods_map: + method = _cupy_nan_methods_map[method] + + result = getattr(cupy, method)(arr, axis=1, **kwargs) + + if result.ndim == 1: + type_coerced_methods = { + "count", + "min", + "max", + "sum", + "prod", + "cummin", + "cummax", + "cumsum", + "cumprod", + } + result_dtype = ( + common_dtype + if method in type_coerced_methods + or is_datetime_dtype(common_dtype) + else None + ) + result = column.as_column(result, dtype=result_dtype) + if mask is not None: + result = result.set_mask( + cudf._lib.transform.bools_to_mask(mask._column) ) + return Series(result, index=self.index, dtype=result_dtype,) + else: + result_df = DataFrame(result).set_index(self.index) + result_df.columns = prepared.columns + return result_df - bool_only = kwargs.pop("bool_only", None) - if bool_only not in (None, True): - raise NotImplementedError( - "Row-wise operations currently do not " - "support `bool_only`." 
- ) + def _apply_support_method(self, method, axis=0, *args, **kwargs): + axis = self._get_axis_from_axis_arg(axis) - prepared, mask, common_dtype = self._prepare_for_rowwise_op( - method, skipna - ) - for col in prepared._data.names: - if prepared._data[col].nullable: - prepared._data[col] = ( - prepared._data[col] - .astype( - cudf.utils.dtypes.get_min_float_dtype( - prepared._data[col] - ) - if not is_datetime_dtype(common_dtype) - else np.dtype("float64") - ) - .fillna(np.nan) - ) - arr = cupy.asarray(prepared.as_gpu_matrix()) - - if skipna is not False and method in _cupy_nan_methods_map: - method = _cupy_nan_methods_map[method] - - result = getattr(cupy, method)(arr, axis=1, **kwargs) - - if result.ndim == 1: - type_coerced_methods = { - "count", - "min", - "max", - "sum", - "prod", - "cummin", - "cummax", - "cumsum", - "cumprod", - } - result_dtype = ( - common_dtype - if method in type_coerced_methods - or is_datetime_dtype(common_dtype) - else None - ) - result = column.as_column(result, dtype=result_dtype) - if mask is not None: - result = result.set_mask( - cudf._lib.transform.bools_to_mask(mask._column) - ) - return Series(result, index=self.index, dtype=result_dtype,) - else: - result_df = DataFrame(result).set_index(self.index) - result_df.columns = prepared.columns - return result_df + if axis == 0: + return self._apply_support_method_axis_0(method, *args, **kwargs) + elif axis == 1: + return self._apply_support_method_axis_1(method, *args, **kwargs) def _columns_view(self, columns): """ diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 14b8ebe801f..6a976f54c2b 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -3604,6 +3604,530 @@ def __pos__(self): def __abs__(self): return self._unaryop("abs") + # Reductions + @classmethod + def _get_axis_from_axis_arg(cls, axis): + try: + return cls._SUPPORT_AXIS_LOOKUP[axis] + except KeyError: + valid_axes = ", ".join( + ( + str(ax) + for ax in 
cls._SUPPORT_AXIS_LOOKUP.keys() + if ax is not None + ) + ) + raise ValueError(f"Invalid axis, must be one of {valid_axes}.") + + def _reduce(self, *args, **kwargs): + raise NotImplementedError( + f"Reductions are not supported for objects of type {type(self)}." + ) + + def min( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, + ): + """ + Return the minimum of the values in the DataFrame. + + Parameters + ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + level: int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a Series. + numeric_only: bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level`, `numeric_only`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.min() + a 1 + b 7 + dtype: int64 + """ + return self._reduce( + "min", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def max( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs, + ): + """ + Return the maximum of the values in the DataFrame. + + Parameters + ---------- + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + level: int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a Series. + numeric_only: bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. 
+ + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level`, `numeric_only`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.max() + a 4 + b 10 + dtype: int64 + """ + return self._reduce( + "max", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def sum( + self, + axis=None, + skipna=None, + dtype=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + """ + Return sum of the values in the DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + dtype: data type + Data type to cast the result to. + min_count: int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result + will be NA. + + The default being 0. This means the sum of an all-NA or empty + Series is 0, and the product of an all-NA or empty Series is 1. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level`, `numeric_only`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.sum() + a 10 + b 34 + dtype: int64 + """ + return self._reduce( + "sum", + axis=axis, + skipna=skipna, + dtype=dtype, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + + def product( + self, + axis=None, + skipna=None, + dtype=None, + level=None, + numeric_only=None, + min_count=0, + **kwargs, + ): + """ + Return product of the values in the DataFrame. + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values when computing the result. + dtype: data type + Data type to cast the result to. 
+ min_count: int, default 0 + The required number of valid values to perform the operation. + If fewer than min_count non-NA values are present the result + will be NA. + + The default being 0. This means the sum of an all-NA or empty + Series is 0, and the product of an all-NA or empty Series is 1. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level`, `numeric_only`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.product() + a 24 + b 5040 + dtype: int64 + """ + axis = self._get_axis_from_axis_arg(axis) + return self._reduce( + # cuDF columns use "product" as the op name, but cupy uses "prod" + # and we need cupy if axis == 1. + "product" if axis == 0 else "prod", + axis=axis, + skipna=skipna, + dtype=dtype, + level=level, + numeric_only=numeric_only, + min_count=min_count, + **kwargs, + ) + + # Alias for pandas compatibility. + prod = product + + def mean( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + """ + Return the mean of the values for the requested axis. + + Parameters + ---------- + axis : {0 or 'index', 1 or 'columns'} + Axis for the function to be applied on. + skipna : bool, default True + Exclude NA/null values when computing the result. + level : int or level name, default None + If the axis is a MultiIndex (hierarchical), count along a + particular level, collapsing into a Series. + numeric_only : bool, default None + Include only float, int, boolean columns. If None, will attempt to + use everything, then use only numeric data. Not implemented for + Series. + **kwargs + Additional keyword arguments to be passed to the function. 
+ + Returns + ------- + mean : Series or DataFrame (if level specified) + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.mean() + a 2.5 + b 8.5 + dtype: float64 + """ + return self._reduce( + "mean", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + + def std( + self, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs, + ): + """ + Return sample standard deviation of the DataFrame. + + Normalized by N-1 by default. This can be changed using + the `ddof` argument + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof: int, default 1 + Delta Degrees of Freedom. The divisor used in calculations + is N - ddof, where N represents the number of elements. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `level` and + `numeric_only` + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.std() + a 1.290994 + b 1.290994 + dtype: float64 + """ + + return self._reduce( + "std", + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + + def var( + self, + axis=None, + skipna=None, + level=None, + ddof=1, + numeric_only=None, + **kwargs, + ): + """ + Return unbiased variance of the DataFrame. + + Normalized by N-1 by default. This can be changed using the + ddof argument + + Parameters + ---------- + + axis: {index (0), columns(1)} + Axis for the function to be applied on. + skipna: bool, default True + Exclude NA/null values. If an entire row/column is NA, the result + will be NA. + ddof: int, default 1 + Delta Degrees of Freedom. The divisor used in calculations is + N - ddof, where N represents the number of elements. 
+ + Returns + ------- + scalar + + Notes + ----- + Parameters currently not supported are `level` and + `numeric_only` + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [1, 2, 3, 4], 'b': [7, 8, 9, 10]}) + >>> df.var() + a 1.666667 + b 1.666667 + dtype: float64 + """ + return self._reduce( + "var", + axis=axis, + skipna=skipna, + level=level, + ddof=ddof, + numeric_only=numeric_only, + **kwargs, + ) + + def all(self, axis=0, skipna=True, level=None, **kwargs): + """ + Return whether all elements are True in DataFrame. + + Parameters + ---------- + + skipna: bool, default True + Exclude NA/null values. If the entire row/column is NA and + skipna is True, then the result will be True, as for an + empty row/column. + If skipna is False, then NA are treated as True, because + these are not equal to zero. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `axis`, `bool_only`, `level`. + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) + >>> df.all() + a True + b False + dtype: bool + """ + return self._reduce( + "all", axis=axis, skipna=skipna, level=level, **kwargs, + ) + + def any(self, axis=0, skipna=True, level=None, **kwargs): + """ + Return whether any element is True in DataFrame. + + Parameters + ---------- + + skipna: bool, default True + Exclude NA/null values. If the entire row/column is NA and + skipna is True, then the result will be False, as for an + empty row/column. + If skipna is False, then NA are treated as True, because + these are not equal to zero. + + Returns + ------- + Series + + Notes + ----- + Parameters currently not supported are `axis`, `bool_only`, `level`. 
+ + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) + >>> df.any() + a True + b True + dtype: bool + """ + return self._reduce( + "any", axis=axis, skipna=skipna, level=level, **kwargs, + ) + + def sum_of_squares(self, dtype=None): + """Return the sum of squares of values. + + Parameters + ---------- + dtype: data type + Data type to cast the result to. + + Returns + ------- + Series + + Examples + -------- + >>> import cudf + >>> df = cudf.DataFrame({'a': [3, 2, 3, 4], 'b': [7, 0, 10, 10]}) + >>> df.sum_of_squares() + a 38 + b 249 + dtype: int64 + """ + return self._reduce("sum_of_squares", dtype=dtype) + + def median( + self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs + ): + """ + Return the median of the values for the requested axis. + + Parameters + ---------- + + skipna : bool, default True + Exclude NA/null values when computing the result. + + Returns + ------- + scalar + + Notes + ----- + Parameters currently not supported are `level` and `numeric_only`. + + Examples + -------- + >>> import cudf + >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) + >>> ser + 0 10 + 1 25 + 2 3 + 3 25 + 4 24 + 5 6 + dtype: int64 + >>> ser.median() + 17.0 + """ + return self._reduce( + "median", + axis=axis, + skipna=skipna, + level=level, + numeric_only=numeric_only, + **kwargs, + ) + class SingleColumnFrame(Frame): """A one-dimensional frame. @@ -3612,6 +4136,27 @@ class SingleColumnFrame(Frame): this class. 
""" + _SUPPORT_AXIS_LOOKUP = { + 0: 0, + None: 0, + "index": 0, + } + + def _reduce( + self, op, axis=None, level=None, numeric_only=None, **kwargs, + ): + if axis not in (None, 0): + raise NotImplementedError("axis parameter is not implemented yet") + + if level is not None: + raise NotImplementedError("level parameter is not implemented yet") + + if numeric_only not in (None, True): + raise NotImplementedError( + "numeric_only parameter is not implemented yet" + ) + return getattr(self._column, op)(**kwargs) + @classmethod def _from_data( cls, diff --git a/python/cudf/cudf/core/index.py b/python/cudf/cudf/core/index.py index e9ab3d5797c..a2f13daf44c 100644 --- a/python/cudf/cudf/core/index.py +++ b/python/cudf/cudf/core/index.py @@ -518,74 +518,6 @@ def gpu_values(self): """ return self._values.data_array_view - def min(self): - """ - Return the minimum value of the Index. - - Returns - ------- - scalar - Minimum value. - - See Also - -------- - cudf.core.index.Index.max : Return the maximum value in an Index. - cudf.core.series.Series.min : Return the minimum value in a Series. - cudf.core.dataframe.DataFrame.min : Return the minimum values in - a DataFrame. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([3, 2, 1]) - >>> idx.min() - 1 - """ - return self._values.min() - - def max(self): - """ - Return the maximum value of the Index. - - Returns - ------- - scalar - Maximum value. - - See Also - -------- - cudf.core.index.Index.min : Return the minimum value in an Index. - cudf.core.series.Series.max : Return the maximum value in a Series. - cudf.core.dataframe.DataFrame.max : Return the maximum values in - a DataFrame. - - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([3, 2, 1]) - >>> idx.max() - 3 - """ - return self._values.max() - - def sum(self): - """ - Return the sum of all values of the Index. - - Returns - ------- - scalar - Sum of all values. 
- - Examples - -------- - >>> import cudf - >>> idx = cudf.Index([3, 2, 1]) - >>> idx.sum() - 6 - """ - return self._values.sum() - @classmethod def _concat(cls, objs): if all(isinstance(obj, RangeIndex) for obj in objs): diff --git a/python/cudf/cudf/core/series.py b/python/cudf/cudf/core/series.py index db88e3f7620..cb7a82bd4c8 100644 --- a/python/cudf/cudf/core/series.py +++ b/python/cudf/cudf/core/series.py @@ -2726,109 +2726,18 @@ def nans_to_nulls(self): return self._copy_construct(data=result_col) def all(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether all elements are True in Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If the entire row/column is NA and - skipna is True, then the result will be True, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.all() - True - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - if bool_only not in (None, True): raise NotImplementedError( - "bool_only parameter is not implemented yet" + "The bool_only parameter is not supported for Series." ) - - if skipna: - result_series = self.nans_to_nulls() - if len(result_series) == result_series.null_count: - return True - else: - result_series = self - return result_series._column.all() + return super().all(axis, skipna, level, **kwargs) def any(self, axis=0, bool_only=None, skipna=True, level=None, **kwargs): - """ - Return whether any elements is True in Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. 
If the entire row/column is NA and - skipna is True, then the result will be False, as for an - empty row/column. - If skipna is False, then NA are treated as True, because - these are not equal to zero. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `bool_only`, `level`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.any() - True - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - if bool_only not in (None, True): raise NotImplementedError( - "bool_only parameter is not implemented yet" + "The bool_only parameter is not supported for Series." ) - - skipna = False if skipna is None else skipna - - if skipna is False and self.has_nulls: - return True - - if skipna: - result_series = self.nans_to_nulls() - if len(result_series) == result_series.null_count: - return False - - else: - result_series = self - - return result_series._column.any() + return super().any(axis, skipna, level, **kwargs) def to_pandas(self, index=True, nullable=False, **kwargs): """ @@ -4021,230 +3930,6 @@ def count(self, level=None, **kwargs): return self.valid_count - def min( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - **kwargs, - ): - """ - Return the minimum of the values in the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. 
- - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.min() - 1 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.min(skipna=skipna, dtype=dtype) - - def max( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - **kwargs, - ): - """ - Return the maximum of the values in the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.max() - 5 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.max(skipna=skipna, dtype=dtype) - - def sum( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return sum of the values in the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. - - min_count : int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. 
This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.sum() - 15 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.sum( - skipna=skipna, dtype=dtype, min_count=min_count - ) - - def product( - self, - axis=None, - skipna=None, - dtype=None, - level=None, - numeric_only=None, - min_count=0, - **kwargs, - ): - """ - Return product of the values in the Series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - dtype : data type - Data type to cast the result to. - - min_count : int, default 0 - The required number of valid values to perform the operation. - If fewer than min_count non-NA values are present the result - will be NA. - - The default being 0. This means the sum of an all-NA or empty - Series is 0, and the product of an all-NA or empty Series is 1. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level`, `numeric_only`. 
- - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([1, 5, 2, 4, 3]) - >>> ser.product() - 120 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.product( - skipna=skipna, dtype=dtype, min_count=min_count - ) - - prod = product - def cummin(self, axis=None, skipna=True, *args, **kwargs): """ Return cumulative minimum of the Series. @@ -4327,8 +4012,6 @@ def cummax(self, axis=0, skipna=True, *args, **kwargs): 3 5 4 5 """ - assert axis in (None, 0) - if axis not in (None, 0): raise NotImplementedError("axis parameter is not implemented yet") @@ -4479,228 +4162,6 @@ def cumprod(self, axis=0, skipna=True, *args, **kwargs): index=self.index, ) - def mean( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return the mean of the values in the series. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) - >>> ser.mean() - 15.5 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.mean(skipna=skipna) - - def std( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return sample standard deviation of the Series. 
- - Normalized by N-1 by default. This can be changed using - the `ddof` argument - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations - is N - ddof, where N represents the number of elements. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 10, 20, 30, 40]) - >>> series - 0 10 - 1 10 - 2 20 - 3 30 - 4 40 - dtype: int64 - >>> series.std() - 13.038404810405298 - >>> series.std(ddof=2) - 15.05545305418162 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.std(skipna=skipna, ddof=ddof) - - def var( - self, - axis=None, - skipna=None, - level=None, - ddof=1, - numeric_only=None, - **kwargs, - ): - """ - Return unbiased variance of the Series. - - Normalized by N-1 by default. This can be changed using the - ddof argument - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values. If an entire row/column is NA, the result - will be NA. - - ddof : int, default 1 - Delta Degrees of Freedom. The divisor used in calculations is - N - ddof, where N represents the number of elements. 
- - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> series = cudf.Series([10, 11, 12, 0, 1]) - >>> series - 0 10 - 1 11 - 2 12 - 3 0 - 4 1 - dtype: int64 - >>> series.var() - 33.7 - """ - - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.var(skipna=skipna, ddof=ddof) - - def sum_of_squares(self, dtype=None): - return self._column.sum_of_squares(dtype=dtype) - - def median( - self, axis=None, skipna=None, level=None, numeric_only=None, **kwargs - ): - """ - Return the median of the values for the requested axis. - - Parameters - ---------- - - skipna : bool, default True - Exclude NA/null values when computing the result. - - Returns - ------- - scalar - - Notes - ----- - Parameters currently not supported are `axis`, `level` and - `numeric_only` - - Examples - -------- - >>> import cudf - >>> ser = cudf.Series([10, 25, 3, 25, 24, 6]) - >>> ser - 0 10 - 1 25 - 2 3 - 3 25 - 4 24 - 5 6 - dtype: int64 - >>> ser.median() - 17.0 - """ - if axis not in (None, 0): - raise NotImplementedError("axis parameter is not implemented yet") - - if level is not None: - raise NotImplementedError("level parameter is not implemented yet") - - if numeric_only not in (None, True): - raise NotImplementedError( - "numeric_only parameter is not implemented yet" - ) - - return self._column.median(skipna=skipna) - def mode(self, dropna=True): """ Return the mode(s) of the dataset. 
@@ -4957,7 +4418,11 @@ def corr(self, other, method="pearson", min_periods=None): -0.20454263717316112 """ - assert method in ("pearson",) and min_periods in (None,) + if method not in ("pearson",): + raise ValueError(f"Unknown method {method}") + + if min_periods not in (None,): + raise NotImplementedError("Unsupported argument 'min_periods'") if self.empty or other.empty: return cudf.utils.dtypes._get_nan_for_dtype(self.dtype) @@ -5406,7 +4871,8 @@ def hash_encode(self, stop, use_name=False): 2 76 dtype: int32 """ - assert stop > 0 + if not stop > 0: + raise ValueError("stop must be a positive integer.") initial_hash = [hash(self.name) & 0xFFFFFFFF] if use_name else None hashed_values = Series(self._hash(initial_hash)) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index 21683d4bdd0..76d24dcd5d2 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -1847,6 +1847,7 @@ def gdf(pdf): lambda df, **kwargs: df.cumsum(**kwargs), lambda df, **kwargs: df.cumprod(**kwargs), lambda df, **kwargs: df.mean(**kwargs), + lambda df, **kwargs: df.median(**kwargs), lambda df, **kwargs: df.sum(**kwargs), lambda df, **kwargs: df.max(**kwargs), lambda df, **kwargs: df.std(ddof=1, **kwargs), @@ -3423,8 +3424,6 @@ def test_all(data): expected = pdata.all(bool_only=True) assert_eq(got, expected) else: - with pytest.raises(NotImplementedError): - gdata.all(bool_only=False) with pytest.raises(NotImplementedError): gdata.all(level="a") @@ -3484,8 +3483,6 @@ def test_any(data, axis): expected = pdata.any(bool_only=True) assert_eq(got, expected) else: - with pytest.raises(NotImplementedError): - gdata.any(bool_only=False) with pytest.raises(NotImplementedError): gdata.any(level="a") diff --git a/python/cudf/cudf/tests/test_index.py b/python/cudf/cudf/tests/test_index.py index 3f58eb3d6e7..38b924006bf 100644 --- a/python/cudf/cudf/tests/test_index.py +++ b/python/cudf/cudf/tests/test_index.py @@ 
-125,7 +125,16 @@ def test_index_comparision(): @pytest.mark.parametrize( - "func", [lambda x: x.min(), lambda x: x.max(), lambda x: x.sum()] + "func", + [ + lambda x: x.min(), + lambda x: x.max(), + lambda x: x.sum(), + lambda x: x.mean(), + lambda x: x.any(), + lambda x: x.all(), + lambda x: x.prod(), + ], ) def test_reductions(func): x = np.asarray([4, 5, 6, 10]) diff --git a/python/cudf/cudf/tests/test_reductions.py b/python/cudf/cudf/tests/test_reductions.py index 7cbc56f943c..2a45c75f6da 100644 --- a/python/cudf/cudf/tests/test_reductions.py +++ b/python/cudf/cudf/tests/test_reductions.py @@ -110,20 +110,25 @@ def test_sum_of_squares(dtype, nelem): dtype = np.dtype(dtype).type data = gen_rand(dtype, nelem) sr = Series(data) + df = cudf.DataFrame(sr) got = sr.sum_of_squares() - # got = dtype(got) + got_df = df.sum_of_squares() expect = (data ** 2).sum() if np.dtype(dtype).kind in {"u", "i"}: if 0 <= expect <= np.iinfo(dtype).max: np.testing.assert_array_almost_equal(expect, got) + np.testing.assert_array_almost_equal(expect, got_df.iloc[0]) else: print("overflow, passing") else: np.testing.assert_approx_equal( expect, got, significant=accuracy_for_dtype[dtype] ) + np.testing.assert_approx_equal( + expect, got_df.iloc[0], significant=accuracy_for_dtype[dtype] + ) @pytest.mark.parametrize(