Skip to content

Commit

Permalink
Consolidate 1D pandas object handling in as_column (#14394)
Browse files Browse the repository at this point in the history
Currently `as_column` has a few different branches handling pandas objects. This PR consolidates `Series`, `Index` and `ExtensionArray` handling into 1 `if` branch such handling is consistent between the 3.

This also disallows `float16` types passed into `dtype` constructor arguments or typed `data` arguments

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Lawrence Mitchell (https://github.com/wence-)

URL: #14394
  • Loading branch information
mroeschke authored Nov 29, 2023
1 parent 4558344 commit e15290a
Show file tree
Hide file tree
Showing 6 changed files with 237 additions and 187 deletions.
31 changes: 24 additions & 7 deletions python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from pandas.api import types as pd_types

import cudf
from cudf.core._compat import PANDAS_GE_150
from cudf.core.dtypes import ( # noqa: F401
_BaseDtype,
dtype,
Expand Down Expand Up @@ -444,15 +445,31 @@ def is_any_real_numeric_dtype(arr_or_dtype) -> bool:
)


def _is_pandas_nullable_extension_dtype(dtype_to_check):
def _is_pandas_nullable_extension_dtype(dtype_to_check) -> bool:
if isinstance(
dtype_to_check, pd.api.extensions.ExtensionDtype
) and not isinstance(dtype_to_check, pd.core.dtypes.dtypes.PandasDtype):
if isinstance(dtype_to_check, pd.CategoricalDtype):
return _is_pandas_nullable_extension_dtype(
dtype_to_check.categories.dtype
)
dtype_to_check,
(
pd.UInt8Dtype,
pd.UInt16Dtype,
pd.UInt32Dtype,
pd.UInt64Dtype,
pd.Int8Dtype,
pd.Int16Dtype,
pd.Int32Dtype,
pd.Int64Dtype,
pd.Float32Dtype,
pd.Float64Dtype,
pd.BooleanDtype,
pd.StringDtype,
),
) or (PANDAS_GE_150 and isinstance(dtype_to_check, pd.ArrowDtype)):
return True
elif isinstance(dtype_to_check, pd.CategoricalDtype):
return _is_pandas_nullable_extension_dtype(
dtype_to_check.categories.dtype
)
elif isinstance(dtype_to_check, pd.IntervalDtype):
return _is_pandas_nullable_extension_dtype(dtype_to_check.subtype)
return False


Expand Down
261 changes: 113 additions & 148 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1964,11 +1964,10 @@ def as_column(
raise ValueError("Data must be 1-dimensional")
current_dtype = np.dtype(desc["typestr"])

arb_dtype = (
np.dtype("float32")
if current_dtype == "float16"
else cudf.dtype(current_dtype)
)
if current_dtype == "float16":
raise TypeError("Unsupported type float16")

arb_dtype = cudf.dtype(current_dtype)

if desc.get("mask", None) is not None:
# Extract and remove the mask from arbitrary before
Expand Down Expand Up @@ -2043,69 +2042,124 @@ def as_column(

return col

elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
if isinstance(arbitrary, pd.Series):
if isinstance(
arbitrary.array, pd.core.arrays.masked.BaseMaskedArray
):
return as_column(arbitrary.array)
elif PANDAS_GE_150 and isinstance(arbitrary.dtype, pd.ArrowDtype):
if cudf.get_option("mode.pandas_compatible"):
raise NotImplementedError("not supported")
return as_column(pa.array(arbitrary.array, from_pandas=True))
elif isinstance(arbitrary.dtype, pd.SparseDtype):
raise NotImplementedError(
f"{arbitrary.dtype} is not supported. Convert first to "
f"{arbitrary.dtype.subtype}."
)
if is_categorical_dtype(arbitrary.dtype):
if isinstance(
arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
data = as_column(pa.array(arbitrary, from_pandas=True))
elif is_interval_dtype(arbitrary.dtype):
if isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
elif isinstance(
arbitrary, (pd.Series, pd.Index, pd.api.extensions.ExtensionArray)
):
if isinstance(arbitrary.dtype, (pd.SparseDtype, pd.PeriodDtype)):
raise NotImplementedError(
f"cuDF does not yet support {type(arbitrary.dtype).__name__}"
)
elif (
cudf.get_option("mode.pandas_compatible")
and isinstance(arbitrary, (pd.DatetimeIndex, pd.TimedeltaIndex))
and arbitrary.freq is not None
):
raise NotImplementedError("freq is not implemented yet")
elif (
isinstance(arbitrary.dtype, pd.DatetimeTZDtype)
or (
isinstance(arbitrary.dtype, pd.IntervalDtype)
and isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype)
)
or (
isinstance(arbitrary.dtype, pd.CategoricalDtype)
and isinstance(
arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
)
data = as_column(pa.array(arbitrary, from_pandas=True))
elif arbitrary.dtype == np.bool_:
data = as_column(cupy.asarray(arbitrary), dtype=arbitrary.dtype)
elif arbitrary.dtype.kind in ("f"):
arb_dtype = np.dtype(arbitrary.dtype)
)
):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
elif _is_pandas_nullable_extension_dtype(arbitrary.dtype):
if cudf.get_option("mode.pandas_compatible"):
raise NotImplementedError("not supported")
if isinstance(arbitrary, (pd.Series, pd.Index)):
# pandas arrays define __arrow_array__ for better
# pyarrow.array conversion
arbitrary = arbitrary.array
data = as_column(
cupy.asarray(arbitrary, dtype=arb_dtype),
pa.array(arbitrary, from_pandas=True),
nan_as_null=nan_as_null,
dtype=dtype,
length=length,
)
elif arbitrary.dtype.kind in ("u", "i"):
elif isinstance(
arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype)
):
data = as_column(
cupy.asarray(arbitrary), nan_as_null=nan_as_null, dtype=dtype
pa.array(arbitrary, from_pandas=True),
nan_as_null=nan_as_null,
dtype=dtype,
length=length,
)
elif isinstance(arbitrary.dtype, pd.PeriodDtype):
elif isinstance(
arbitrary.dtype, pd.api.extensions.ExtensionDtype
) and not isinstance(arbitrary, pd.arrays.PandasArray):
raise NotImplementedError(
"cuDF does not yet support `PeriodDtype`"
"Custom pandas ExtensionDtypes are not supported"
)
else:
if cudf.get_option(
"mode.pandas_compatible"
) and _is_pandas_nullable_extension_dtype(arbitrary.dtype):
raise NotImplementedError("not supported")
pyarrow_array = pa.array(arbitrary, from_pandas=nan_as_null)
if arbitrary.dtype == cudf.dtype("object") and cudf.dtype(
pyarrow_array.type.to_pandas_dtype()
) != cudf.dtype(arbitrary.dtype):
elif arbitrary.dtype.kind in "fiubmM":
# numpy dtype like
if isinstance(arbitrary, pd.arrays.PandasArray):
arbitrary = np.array(arbitrary)
arb_dtype = np.dtype(arbitrary.dtype)
if arb_dtype.kind == "f" and arb_dtype.itemsize == 2:
raise TypeError("Unsupported type float16")
elif arb_dtype.kind in "mM":
# not supported by cupy
arbitrary = np.asarray(arbitrary)
else:
arbitrary = cupy.asarray(arbitrary)
data = as_column(
arbitrary, nan_as_null=nan_as_null, dtype=dtype, length=length
)
elif arbitrary.dtype.kind == "O":
if isinstance(arbitrary, pd.arrays.PandasArray):
# infer_dtype does not handle PandasArray
arbitrary = np.array(arbitrary, dtype=object)
inferred_dtype = infer_dtype(arbitrary)
if inferred_dtype in ("mixed-integer", "mixed-integer-float"):
raise MixedTypeError("Cannot create column with mixed types")
elif inferred_dtype not in (
"mixed",
"decimal",
"string",
"empty",
"boolean",
):
raise TypeError(
f"Cannot convert a {inferred_dtype} of object type"
)
elif nan_as_null is False and (
pd.isna(arbitrary).any()
and inferred_dtype not in ("decimal", "empty")
):
# Decimal can hold float("nan")
# All np.nan is not restricted by type
raise MixedTypeError(f"Cannot have NaN with {inferred_dtype}")

pyarrow_array = pa.array(
arbitrary,
from_pandas=True,
)
if isinstance(pyarrow_array.type, pa.Decimal128Type):
pyarrow_type = cudf.Decimal128Dtype.from_arrow(
pyarrow_array.type
)
else:
pyarrow_type = arbitrary.dtype
data = as_column(pyarrow_array, dtype=pyarrow_type)
data = as_column(
pyarrow_array,
dtype=pyarrow_type,
nan_as_null=nan_as_null,
length=length,
)
else:
raise NotImplementedError(
f"{type(arbitrary).__name__} with "
f"{type(arbitrary.dtype).__name__} is not supported."
)
if dtype is not None:
data = data.astype(dtype)

Expand Down Expand Up @@ -2184,11 +2238,13 @@ def as_column(
arbitrary = arbitrary.astype(cudf.dtype("datetime64[s]"))

buffer = as_buffer(arbitrary.view("|u1"))
mask = None
if nan_as_null is None or nan_as_null is True:
data = build_column(buffer, dtype=arbitrary.dtype)
data = _make_copy_replacing_NaT_with_null(data)
mask = data.mask
else:
bool_mask = as_column(~np.isnat(arbitrary))
mask = as_buffer(bools_to_mask(bool_mask))

data = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype)
elif arb_dtype.kind == "m":
Expand All @@ -2199,11 +2255,13 @@ def as_column(
arbitrary = arbitrary.astype(cudf.dtype("timedelta64[s]"))

buffer = as_buffer(arbitrary.view("|u1"))
mask = None
if nan_as_null is None or nan_as_null is True:
data = build_column(buffer, dtype=arbitrary.dtype)
data = _make_copy_replacing_NaT_with_null(data)
mask = data.mask
else:
bool_mask = as_column(~np.isnat(arbitrary))
mask = as_buffer(bools_to_mask(bool_mask))

data = cudf.core.column.timedelta.TimeDeltaColumn(
data=buffer,
Expand Down Expand Up @@ -2233,7 +2291,7 @@ def as_column(
data = data.astype(dtype)
elif arb_dtype.kind in ("f"):
if arb_dtype == np.dtype("float16"):
arb_dtype = np.dtype("float32")
raise TypeError("Unsupported type float16")
arb_dtype = cudf.dtype(arb_dtype if dtype is None else dtype)
data = as_column(
cupy.asarray(arbitrary, dtype=arb_dtype),
Expand All @@ -2245,105 +2303,12 @@ def as_column(
if delayed_cast:
data = data.astype(cudf.dtype(dtype))

elif isinstance(arbitrary, pd.arrays.PandasArray):
if cudf.get_option(
"mode.pandas_compatible"
) and _is_pandas_nullable_extension_dtype(arbitrary.dtype):
raise NotImplementedError("not supported")
if is_categorical_dtype(arbitrary.dtype):
arb_dtype = arbitrary.dtype
else:
if arbitrary.dtype == pd.StringDtype():
arb_dtype = cudf.dtype("O")
else:
arb_dtype = (
cudf.dtype("float32")
if arbitrary.dtype == "float16"
else cudf.dtype(arbitrary.dtype)
)
if arb_dtype != arbitrary.dtype.numpy_dtype:
arbitrary = arbitrary.astype(arb_dtype)
if (
arbitrary.size != 0
and isinstance(arbitrary[0], pd._libs.interval.Interval)
and arb_dtype.kind in ("O")
):
# changing from pd array to series,possible arrow bug
interval_series = pd.Series(arbitrary)
data = as_column(
pa.Array.from_pandas(interval_series), dtype=arb_dtype
)
elif arb_dtype.kind in ("O", "U"):
pyarrow_array = pa.Array.from_pandas(arbitrary)
if not isinstance(
pyarrow_array,
(
pa.ListArray,
pa.StructArray,
pa.NullArray,
pa.Decimal128Array,
pa.StringArray,
pa.BooleanArray,
),
):
raise MixedTypeError("Cannot create column with mixed types")
data = as_column(pyarrow_array, dtype=arb_dtype)
else:
data = as_column(
pa.array(
arbitrary,
from_pandas=True if nan_as_null is None else nan_as_null,
),
nan_as_null=nan_as_null,
)
if dtype is not None:
data = data.astype(dtype)
elif isinstance(arbitrary, pd.arrays.SparseArray):
raise NotImplementedError(
f"{arbitrary.dtype} is not supported. Convert first to "
f"{arbitrary.dtype.subtype}."
)
elif isinstance(arbitrary, memoryview):
data = as_column(
np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null
)
elif isinstance(arbitrary, cudf.Scalar):
data = ColumnBase.from_scalar(arbitrary, length if length else 1)
elif isinstance(arbitrary, pd.core.arrays.masked.BaseMaskedArray):
if cudf.get_option("mode.pandas_compatible"):
raise NotImplementedError("not supported")
data = as_column(pa.Array.from_pandas(arbitrary), dtype=dtype)
elif (
(
isinstance(arbitrary, pd.DatetimeIndex)
and isinstance(arbitrary.dtype, pd.DatetimeTZDtype)
)
or (
isinstance(arbitrary, pd.IntervalIndex)
and is_datetime64tz_dtype(arbitrary.dtype.subtype)
)
or (
isinstance(arbitrary, pd.CategoricalIndex)
and isinstance(
arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
)
)
):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
elif isinstance(
arbitrary, (pd.core.arrays.period.PeriodArray, pd.PeriodIndex)
):
raise NotImplementedError(
f"cuDF does not yet support {type(arbitrary).__name__}"
)
elif (
cudf.get_option("mode.pandas_compatible")
and isinstance(arbitrary, (pd.DatetimeIndex, pd.TimedeltaIndex))
and arbitrary.freq is not None
):
raise NotImplementedError("freq is not implemented yet")
else:
try:
data = as_column(
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3374,7 +3374,6 @@ def describe(
data=data.values(),
index=data.keys(),
dtype=dtype,
nan_as_null=False,
name=self.name,
)

Expand Down
Loading

0 comments on commit e15290a

Please sign in to comment.