Skip to content

Commit

Permalink
Introduce a common parent class for NumericalColumn and DecimalColumn (
Browse files Browse the repository at this point in the history
…#8278)

Adds a common parent class for NumericalColumn and DecimalColumn that implements the common operations (mainly reductions and scans). As part of enabling this, the _copy_type_metadata method was refactored to have different column types override it to perform column-specific tasks (necessary because DecimalColumn has some special handling), which should also provide some marginal performance improvements for methods that copy metadata.

This PR (and some to follow) is designed to help simplify the ongoing [Array refactoring](#8273).

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Keith Kraus (https://github.com/kkraus14)
  - GALI PREM SAGAR (https://github.com/galipremsagar)

URL: #8278
  • Loading branch information
vyasr authored May 26, 2021
1 parent e97fc1c commit cfc7ef9
Show file tree
Hide file tree
Showing 8 changed files with 282 additions and 345 deletions.
22 changes: 22 additions & 0 deletions python/cudf/cudf/core/column/categorical.py
Original file line number Diff line number Diff line change
Expand Up @@ -1506,6 +1506,28 @@ def _concat(objs: MutableSequence[CategoricalColumn]) -> CategoricalColumn:
offset=codes_col.offset,
)

def _copy_type_metadata(
    self: CategoricalColumn, other: ColumnBase
) -> ColumnBase:
    """Copies type metadata from self onto other, returning a new column.

    In addition to the default behavior, if `other` is not a
    CategoricalColumn, we assume other is a column of codes, and return a
    CategoricalColumn composed of `other` and the categories of `self`.
    """
    result = other
    if not isinstance(result, cudf.core.column.CategoricalColumn):
        # Treat `other` as a bare codes column and wrap it with the
        # categories and ordering of `self`.
        codes = column.as_column(result.base_data, dtype=result.dtype)
        result = column.build_categorical_column(
            categories=self.categories,
            codes=codes,
            mask=other.base_mask,
            ordered=self.ordered,
            size=other.size,
            offset=other.offset,
            null_count=other.null_count,
        )
    # The ignore is needed because mypy misdiagnoses super() here.
    return super()._copy_type_metadata(result)  # type: ignore


def _create_empty_categorical_column(
categorical_column: CategoricalColumn, dtype: "CategoricalDtype"
Expand Down
52 changes: 9 additions & 43 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,7 +8,6 @@
from types import SimpleNamespace
from typing import (
Any,
Callable,
Dict,
List,
MutableSequence,
Expand Down Expand Up @@ -310,16 +309,6 @@ def _memory_usage(self, **kwargs) -> int:
def default_na_value(self) -> Any:
    """Return the default fill value for this column.

    The base class provides no default; this stub always raises and must
    be overridden by column types that have a meaningful NA value.
    """
    raise NotImplementedError()

def applymap(
    self, udf: Callable[[ScalarLike], ScalarLike], out_dtype: Dtype = None
) -> ColumnBase:
    """Apply an element-wise function to the values in the Column.

    Unsupported here; subclasses that support applymap must override
    this method.
    """
    msg = (
        "User-defined functions are currently not supported on data "
        f"with dtype {self.dtype}."
    )
    raise TypeError(msg)

def to_gpu_array(self, fillna=None) -> "cuda.devicearray.DeviceNDArray":
"""Get a dense numba device array for the data.
Expand Down Expand Up @@ -1139,6 +1128,11 @@ def binary_operator(
f"{other.dtype}."
)

def normalize_binop_value(
    self, other: ScalarLike
) -> Union[ColumnBase, ScalarLike]:
    """Coerce `other` into a form usable in a binary op with this column.

    Abstract on the base class; concrete column types (e.g. DecimalColumn)
    override this with type-specific normalization.
    """
    raise NotImplementedError

def min(self, skipna: bool = None, dtype: Dtype = None):
result_col = self._process_for_reduction(skipna=skipna)
if isinstance(result_col, ColumnBase):
Expand Down Expand Up @@ -1273,46 +1267,18 @@ def scatter_to_table(
}
)

def _copy_type_metadata(self: T, other: ColumnBase) -> ColumnBase:
def _copy_type_metadata(self: ColumnBase, other: ColumnBase) -> ColumnBase:
"""
Copies type metadata from self onto other, returning a new column.
* when `self` is a CategoricalColumn and `other` is not, we assume
other is a column of codes, and return a CategoricalColumn composed
of `other` and the categories of `self`.
* when both `self` and `other` are StructColumns, rename the fields
of `other` to the field names of `self`.
* when both `self` and `other` are DecimalColumns, copy the precision
from self.dtype to other.dtype
* when `self` and `other` are nested columns of the same type,
recursively apply this function on the children of `self` to the
and the children of `other`.
* if none of the above, return `other` without any changes
"""
if isinstance(self, cudf.core.column.CategoricalColumn) and not (
isinstance(other, cudf.core.column.CategoricalColumn)
):
other = build_categorical_column(
categories=self.categories,
codes=as_column(other.base_data, dtype=other.dtype),
mask=other.base_mask,
ordered=self.ordered,
size=other.size,
offset=other.offset,
null_count=other.null_count,
)

if isinstance(other, cudf.core.column.StructColumn) and isinstance(
self, cudf.core.column.StructColumn
):
other = other._rename_fields(self.dtype.fields.keys())

if isinstance(other, cudf.core.column.DecimalColumn) and isinstance(
self, cudf.core.column.DecimalColumn
):
other.dtype.precision = self.dtype.precision

if type(self) is type(other):
# TODO: This logic should probably be moved to a common nested column
# class.
if isinstance(other, type(self)):
if self.base_children and other.base_children:
base_children = tuple(
self.base_children[i]._copy_type_metadata(
Expand Down
83 changes: 17 additions & 66 deletions python/cudf/cudf/core/column/decimal.py
Original file line number Diff line number Diff line change
@@ -1,7 +1,6 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from decimal import Decimal
from numbers import Number
from typing import Any, Sequence, Tuple, Union, cast

import cupy as cp
Expand All @@ -22,11 +21,16 @@
from cudf.utils.dtypes import is_scalar
from cudf.utils.utils import pa_mask_buffer_to_mask

from .numerical_base import NumericalBaseColumn

class DecimalColumn(ColumnBase):

class DecimalColumn(NumericalBaseColumn):
dtype: Decimal64Dtype

def __truediv__(self, other):
    """Implement `self / other` via the "div" binary operator.

    TODO: This override is not sufficient. While it will change the
    behavior of x / y for two decimal columns, it will not affect
    col1.binary_operator(col2), which is how Series/Index will call this.
    """
    return self.binary_operator("div", other)

def __setitem__(self, key, value):
Expand Down Expand Up @@ -123,39 +127,6 @@ def normalize_binop_value(self, other):
else:
raise TypeError(f"cannot normalize {type(other)}")

def _apply_scan_op(self, op: str) -> ColumnBase:
    """Run the libcudf scan `op` over this column and re-apply this
    column's type metadata to the result."""
    scanned = libcudf.reduce.scan(op, self, True)
    return self._copy_type_metadata(scanned)

def quantile(
    self, q: Union[float, Sequence[float]], interpolation: str, exact: bool
) -> ColumnBase:
    """Compute the quantile(s) `q` of this column.

    Raises ValueError when any requested percentile falls outside [0, 1].
    For a scalar `q`, a NA result is converted to the NaN value
    appropriate for this dtype.
    """
    if isinstance(q, Number) or cudf.utils.dtypes.is_list_like(q):
        qs = np.asarray(q)
        if ((qs < 0) | (qs > 1)).any():
            raise ValueError(
                "percentiles should all be in the interval [0, 1]"
            )
    # From here on, scalar or list-like `q` is known to lie in [0, 1].
    result = self._decimal_quantile(q, interpolation, exact)
    if not isinstance(q, Number):
        return result
    scalar_result = result[0]
    if scalar_result is cudf.NA:
        return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)
    return scalar_result

def median(self, skipna: bool = None) -> ColumnBase:
    """Compute the median as the 0.5 quantile.

    With skipna=False, a column containing nulls yields the dtype's NaN.
    """
    if skipna is None:
        skipna = True

    if not skipna and self.has_nulls:
        return cudf.utils.dtypes._get_nan_for_dtype(self.dtype)

    # enforce linear in case the default ever changes
    return self.quantile(0.5, interpolation="linear", exact=True)

def _decimal_quantile(
self, q: Union[float, Sequence[float]], interpolation: str, exact: bool
) -> ColumnBase:
Expand Down Expand Up @@ -194,37 +165,6 @@ def as_string_column(
"cudf.core.column.StringColumn", as_column([], dtype="object")
)

def reduce(self, op: str, skipna: bool = None, **kwargs) -> Decimal:
    """Perform the reduction `op` over this column.

    `min_count` is consumed from kwargs for the preprocessing step; any
    remaining kwargs are forwarded to the libcudf reduction.
    """
    min_count = kwargs.pop("min_count", 0)
    preprocessed = self._process_for_reduction(
        skipna=skipna, min_count=min_count
    )
    if not isinstance(preprocessed, ColumnBase):
        # Preprocessing already produced a scalar; return it directly.
        return preprocessed
    return libcudf.reduce.reduce(op, preprocessed, **kwargs)

def sum(
    self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
) -> Decimal:
    """Sum of the column, delegated to the generic `reduce`."""
    return self.reduce("sum", skipna=skipna, dtype=dtype, min_count=min_count)

def product(
    self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
) -> Decimal:
    """Product of the column, delegated to the generic `reduce`."""
    return self.reduce(
        "product", skipna=skipna, dtype=dtype, min_count=min_count
    )

def sum_of_squares(
    self, skipna: bool = None, dtype: Dtype = None, min_count: int = 0
) -> Decimal:
    """Sum of squares of the column, delegated to the generic `reduce`."""
    return self.reduce(
        "sum_of_squares", skipna=skipna, dtype=dtype, min_count=min_count
    )

def fillna(
self, value: Any = None, method: str = None, dtype: Dtype = None
):
Expand Down Expand Up @@ -269,6 +209,17 @@ def __cuda_array_interface__(self):
"Decimals are not yet supported via `__cuda_array_interface__`"
)

def _copy_type_metadata(self: ColumnBase, other: ColumnBase) -> ColumnBase:
    """Copies type metadata from self onto other, returning a new column.

    In addition to the default behavior, if `other` is also a decimal
    column the precision is copied over.
    """
    if isinstance(other, DecimalColumn):
        # Precision lives on the dtype, so it is propagated in place.
        other.dtype.precision = self.dtype.precision  # type: ignore
    # The ignore is needed because mypy misdiagnoses super() here.
    return super()._copy_type_metadata(other)  # type: ignore


def _binop_scale(l_dtype, r_dtype, op):
# This should at some point be hooked up to libcudf's
Expand Down
Loading

0 comments on commit cfc7ef9

Please sign in to comment.