Skip to content

Commit

Permalink
Introduce a common parent class for NumericalColumn and DecimalColumn (
Browse files Browse the repository at this point in the history
…#8278)

Adds a common parent class for NumericalColumn and DecimalColumn that implements the common operations (mainly reductions and scans). As part of enabling this, the _copy_type_metadata method was refactored to have different column types override it to perform column-specific tasks (necessary because DecimalColumn has some special handling), which should also provide some marginal performance improvements for methods that copy metadata.

This PR (and some to follow) is designed to help simplify the ongoing [Array refactoring](#8273).

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Keith Kraus (https://github.com/kkraus14)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #8278
  • Loading branch information
vyasr authored May 26, 2021
1 parent e97fc1c commit cfc7ef9
Show file tree
Hide file tree
Showing 8 changed files with 282 additions and 345 deletions.
22 changes: 22 additions & 0 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1506,6 +1506,28 @@ def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn:
offset=codes_col.offset,
)

def _copy_type_metadata(
    self: CategoricalColumn, other: ColumnBase
) -> ColumnBase:
    """Copies type metadata from self onto other, returning a new column.

    In addition to the default behavior, if `other` is not a
    CategoricalColumn, we assume other is a column of codes, and return a
    CategoricalColumn composed of `other` and the categories of `self`.
    """
    result = other
    if not isinstance(result, cudf.core.column.CategoricalColumn):
        # Treat `other` as a bare codes column and wrap it with the
        # categories and ordering of `self`.
        codes = column.as_column(result.base_data, dtype=result.dtype)
        result = column.build_categorical_column(
            categories=self.categories,
            codes=codes,
            mask=other.base_mask,
            ordered=self.ordered,
            size=other.size,
            offset=other.offset,
            null_count=other.null_count,
        )
    # The ignore is needed because mypy misdiagnoses super() here.
    return super()._copy_type_metadata(result)  # type: ignore


def _create_empty_categorical_column(
categorical_column: CategoricalColumn, dtype: "CategoricalDtype"
Expand Down
52 changes: 9 additions & 43 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from types import SimpleNamespace
from typing import (
Any,
Callable,
Dict,
List,
MutableSequence,
Expand Down Expand Up @@ -310,16 +309,6 @@ def _memory_usage(self, **kwargs) -> int:
def default_na_value(self) -> Any:
    """Return the default fill value for this column.

    The base class provides no default; this stub always raises and must
    be overridden by column types that have a meaningful NA value.
    """
    raise NotImplementedError()

def applymap(
    self, udf: Callable[[ScalarLike], ScalarLike], out_dtype: Dtype = None
) -> ColumnBase:
    """Apply an element-wise function to the values in the Column.

    Unsupported here; subclasses that support applymap must override
    this method.
    """
    msg = (
        "User-defined functions are currently not supported on data "
        f"with dtype {self.dtype}."
    )
    raise TypeError(msg)

def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray":
"""Get a dense numba device array for the data.
Expand Down Expand Up @@ -1139,6 +1128,11 @@ def binary_operator(
f"{other.dtype}."
)

def normalize_binop_value(
    self, other: ScalarLike
) -> Union[ColumnBase, ScalarLike]:
    """Coerce `other` into a form usable in a binary op with this column.

    Abstract on the base class; concrete column types (e.g. DecimalColumn)
    override this with type-specific normalization.
    """
    raise NotImplementedError

def min(self, skipna: bool = None, dtype: Dtype = None):
result_col = self._process_for_reduction(skipna=skipna)
if isinstance(result_col, ColumnBase):
Expand Down Expand Up @@ -1273,46 +1267,18 @@ def scatter_to_table(
}
)

def _copy_type_metadata(self: T, other: ColumnBase) -> ColumnBase:
def _copy_type_metadata(self: ColumnBase, other: ColumnBase) -> ColumnBase:
"""
Copies type metadata from self onto other, returning a new column.
* when `self` is a CategoricalColumn and `other` is not, we assume
other is a column of codes, and return a CategoricalColumn composed
of `other` and the categories of `self`.
* when both `self` and `other` are StructColumns, rename the fields
of `other` to the field names of `self`.
* when both `self` and `other` are DecimalColumns, copy the precision
from self.dtype to other.dtype
* when `self` and `other` are nested columns of the same type,
recursively apply this function on the children of `self` to the
and the children of `other`.
* if none of the above, return `other` without any changes
"""
if isinstance(self, cudf.core.column.CategoricalColumn) and not (
isinstance(other, cudf.core.column.CategoricalColumn)
):
other = build_categorical_column(
categories=self.categories,
codes=as_column(other.base_data, dtype=other.dtype),
mask=other.base_mask,
ordered=self.ordered,
size=other.size,
offset=other.offset,
null_count=other.null_count,
)

if isinstance(other, cudf.core.column.StructColumn) and isinstance(
self, cudf.core.column.StructColumn
):
other = other._rename_fields(self.dtype.fields.keys())

if isinstance(other, cudf.core.column.DecimalColumn) and isinstance(
self, cudf.core.column.DecimalColumn
):
other.dtype.precision = self.dtype.precision

if type(self) is type(other):
# TODO: This logic should probably be moved to a common nested column
# class.
if isinstance(other, type(self)):
if self.base_children and other.base_children:
base_children = tuple(
self.base_children[i]._copy_type_metadata(
Expand Down
83 changes: 17 additions & 66 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from decimal import Decimal
from numbers import Number
from typing import Any, Sequence, Tuple, Union, cast

import cupy as cp
Expand All @@ -22,11 +21,16 @@
from cudf.utils.dtypes import is_scalar
from cudf.utils.utils import pa_mask_buffer_to_mask

from .numerical_base import NumericalBaseColumn

class DecimalColumn(ColumnBase):

class DecimalColumn(NumericalBaseColumn):
dtype: Decimal64Dtype

def __truediv__(self, other):
    """Implement `self / other` via the "div" binary operator.

    TODO: This override is not sufficient. While it will change the
    behavior of x / y for two decimal columns, it will not affect
    col1.binary_operator(col2), which is how Series/Index will call this.
    """
    return self.binary_operator("div", other)

def __setitem__(self, key, value):
Expand Down Expand Up @@ -123,39 +127,6 @@ def normalize_binop_value(self, other):
else:
raise TypeError(f"cannot normalize {type(other)}")

def _apply_scan_op(self, op: str) -> ColumnBase:
    """Run the libcudf scan `op` over this column and re-apply this
    column's type metadata to the result."""
    scanned = libcudf.reduce.scan(op, self, True)
    return self._copy_type_metadata(scanned)

def quantile(
    self, q: Union[float, Sequence[float]], interpolation: str, exact: bool
) -> ColumnBase:
    """Compute the quantile(s) `q` of this column.

    Raises ValueError when any requested percentile falls outside [0, 1].
    For a scalar `q`, a NA result is converted to the NaN value
    appropriate for this dtype.
    """
    if isinstance(q, Number) or cudf.utils.dtypes.is_list_like(q):
        qs = np.asarray(q)
        if ((qs < 0) | (qs > 1)).any():
            raise ValueError(
                "percentiles should all be in the interval [0, 1]"
            )
    # From here on, scalar or list-like `q` is known to lie in [0, 1].
    result = self._decimal_quantile(q, interpolation, exact)
    if not isinstance(q, Number):
        return result
    scalar_result = result[0]
    if scalar_result is cudf.NA:
        return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
    return scalar_result

def median(self, skipna: bool = None) -> ColumnBase:
    """Compute the median as the 0.5 quantile.

    With skipna=False, a column containing nulls yields the dtype's NaN.
    """
    if skipna is None:
        skipna = True

    if not skipna and self.has_nulls:
        return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)

    # enforce linear in case the default ever changes
    return self.quantile(0.5, interpolation="linear", exact=True)

def _decimal_quantile(
self, q: Union[float, Sequence[float]], interpolation: str, exact: bool
) -> ColumnBase:
Expand Down Expand Up @@ -194,37 +165,6 @@ def as_string_column(
"cudf.core.column.StringColumn", as_column([], dtype="object")
)

def reduce(self, op: str, skipna: bool = None, **kwargs) -> Decimal:
    """Perform the reduction `op` over this column.

    `min_count` is consumed from kwargs for the preprocessing step; any
    remaining kwargs are forwarded to the libcudf reduction.
    """
    min_count = kwargs.pop("min_count", 0)
    preprocessed = self._process_for_reduction(
        skipna=skipna, min_count=min_count
    )
    if not isinstance(preprocessed, ColumnBase):
        # Preprocessing already produced a scalar; return it directly.
        return preprocessed
    return libcudf.reduce.reduce(op, preprocessed, **kwargs)

def sum(
    self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
) -> Decimal:
    """Sum of the column, delegated to the generic `reduce`."""
    return self.reduce("sum", skipna=skipna, dtype=dtype, min_count=min_count)

def product(
    self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
) -> Decimal:
    """Product of the column, delegated to the generic `reduce`."""
    return self.reduce(
        "product", skipna=skipna, dtype=dtype, min_count=min_count
    )

def sum_of_squares(
    self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
) -> Decimal:
    """Sum of squares of the column, delegated to the generic `reduce`."""
    return self.reduce(
        "sum_of_squares", skipna=skipna, dtype=dtype, min_count=min_count
    )

def fillna(
self, value: Any = None, method: str = None, dtype: Dtype = None
):
Expand Down Expand Up @@ -269,6 +209,17 @@ def __cuda_array_interface__(self):
"Decimals are not yet supported via `__cuda_array_interface__`"
)

def _copy_type_metadata(self: ColumnBase, other: ColumnBase) -> ColumnBase:
    """Copies type metadata from self onto other, returning a new column.

    In addition to the default behavior, if `other` is also a decimal
    column the precision is copied over.
    """
    if isinstance(other, DecimalColumn):
        # Precision lives on the dtype, so it is propagated in place.
        other.dtype.precision = self.dtype.precision  # type: ignore
    # The ignore is needed because mypy misdiagnoses super() here.
    return super()._copy_type_metadata(other)  # type: ignore


def _binop_scale(l_dtype, r_dtype, op):
# This should at some point be hooked up to libcudf's
Expand Down
Loading

0 comments on commit cfc7ef9

Please sign in to comment.