Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Consolidate 1D pandas object handling in as_column #14394

Merged
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
31 changes: 24 additions & 7 deletions python/cudf/cudf/api/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from pandas.api import types as pd_types

import cudf
from cudf.core._compat import PANDAS_GE_150
from cudf.core.dtypes import ( # noqa: F401
_BaseDtype,
dtype,
Expand Down Expand Up @@ -444,15 +445,31 @@ def is_any_real_numeric_dtype(arr_or_dtype) -> bool:
)


def _is_pandas_nullable_extension_dtype(dtype_to_check):
def _is_pandas_nullable_extension_dtype(dtype_to_check) -> bool:
if isinstance(
dtype_to_check, pd.api.extensions.ExtensionDtype
) and not isinstance(dtype_to_check, pd.core.dtypes.dtypes.PandasDtype):
if isinstance(dtype_to_check, pd.CategoricalDtype):
return _is_pandas_nullable_extension_dtype(
dtype_to_check.categories.dtype
)
dtype_to_check,
(
pd.UInt8Dtype,
pd.UInt16Dtype,
pd.UInt32Dtype,
pd.UInt64Dtype,
pd.Int8Dtype,
pd.Int16Dtype,
pd.Int32Dtype,
pd.Int64Dtype,
pd.Float32Dtype,
pd.Float64Dtype,
pd.BooleanDtype,
pd.StringDtype,
),
) or (PANDAS_GE_150 and isinstance(dtype_to_check, pd.ArrowDtype)):
return True
elif isinstance(dtype_to_check, pd.CategoricalDtype):
return _is_pandas_nullable_extension_dtype(
dtype_to_check.categories.dtype
)
elif isinstance(dtype_to_check, pd.IntervalDtype):
return _is_pandas_nullable_extension_dtype(dtype_to_check.subtype)
return False


Expand Down
261 changes: 113 additions & 148 deletions python/cudf/cudf/core/column/column.py
Original file line number Diff line number Diff line change
Expand Up @@ -1964,11 +1964,10 @@ def as_column(
raise ValueError("Data must be 1-dimensional")
current_dtype = np.dtype(desc["typestr"])

arb_dtype = (
np.dtype("float32")
if current_dtype == "float16"
else cudf.dtype(current_dtype)
)
if current_dtype == "float16":
raise TypeError("Unsupported type float16")

arb_dtype = cudf.dtype(current_dtype)

if desc.get("mask", None) is not None:
# Extract and remove the mask from arbitrary before
Expand Down Expand Up @@ -2043,69 +2042,124 @@ def as_column(

return col

elif isinstance(arbitrary, (pd.Series, pd.Categorical)):
if isinstance(arbitrary, pd.Series):
if isinstance(
arbitrary.array, pd.core.arrays.masked.BaseMaskedArray
):
return as_column(arbitrary.array)
elif PANDAS_GE_150 and isinstance(arbitrary.dtype, pd.ArrowDtype):
if cudf.get_option("mode.pandas_compatible"):
raise NotImplementedError("not supported")
return as_column(pa.array(arbitrary.array, from_pandas=True))
elif isinstance(arbitrary.dtype, pd.SparseDtype):
raise NotImplementedError(
f"{arbitrary.dtype} is not supported. Convert first to "
f"{arbitrary.dtype.subtype}."
)
if is_categorical_dtype(arbitrary.dtype):
if isinstance(
arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
data = as_column(pa.array(arbitrary, from_pandas=True))
elif is_interval_dtype(arbitrary.dtype):
if isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
elif isinstance(
arbitrary, (pd.Series, pd.Index, pd.api.extensions.ExtensionArray)
):
if isinstance(arbitrary.dtype, (pd.SparseDtype, pd.PeriodDtype)):
raise NotImplementedError(
f"cuDF does not yet support {type(arbitrary.dtype).__name__}"
)
elif (
cudf.get_option("mode.pandas_compatible")
and isinstance(arbitrary, (pd.DatetimeIndex, pd.TimedeltaIndex))
and arbitrary.freq is not None
):
raise NotImplementedError("freq is not implemented yet")
elif (
isinstance(arbitrary.dtype, pd.DatetimeTZDtype)
or (
isinstance(arbitrary.dtype, pd.IntervalDtype)
and isinstance(arbitrary.dtype.subtype, pd.DatetimeTZDtype)
)
or (
isinstance(arbitrary.dtype, pd.CategoricalDtype)
and isinstance(
arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
)
data = as_column(pa.array(arbitrary, from_pandas=True))
elif arbitrary.dtype == np.bool_:
data = as_column(cupy.asarray(arbitrary), dtype=arbitrary.dtype)
elif arbitrary.dtype.kind in ("f"):
arb_dtype = np.dtype(arbitrary.dtype)
)
):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
elif _is_pandas_nullable_extension_dtype(arbitrary.dtype):
if cudf.get_option("mode.pandas_compatible"):
raise NotImplementedError("not supported")
Comment on lines +2075 to +2076
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

question: What happens to this type of data if we're not in pandas-compat mode? And why is it not supported if we are?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens to this type of data if we're not in pandas-compat mode?

We currently convert this correctly to a corresponding cudf type

In [1]: import cudf; import pandas as pd

In [2]: cudf.from_pandas(pd.Series([1], dtype=pd.Int64Dtype()))
Out[2]: 
0    1
dtype: int64

And why is it not supported if we are?

We disallowed it for cudf.pandas since currently we cannot roundtrip back to pandas correctly since cudf doesn't keep track whether a e.g. numpy.int64 or pandas.Int64Dtype was passed in

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Ah, makes sense, thanks.

if isinstance(arbitrary, (pd.Series, pd.Index)):
# pandas arrays define __arrow_array__ for better
# pyarrow.array conversion
arbitrary = arbitrary.array
wence- marked this conversation as resolved.
Show resolved Hide resolved
data = as_column(
cupy.asarray(arbitrary, dtype=arb_dtype),
pa.array(arbitrary, from_pandas=True),
nan_as_null=nan_as_null,
dtype=dtype,
length=length,
)
elif arbitrary.dtype.kind in ("u", "i"):
elif isinstance(
arbitrary.dtype, (pd.CategoricalDtype, pd.IntervalDtype)
):
data = as_column(
cupy.asarray(arbitrary), nan_as_null=nan_as_null, dtype=dtype
pa.array(arbitrary, from_pandas=True),
nan_as_null=nan_as_null,
dtype=dtype,
length=length,
)
elif isinstance(arbitrary.dtype, pd.PeriodDtype):
elif isinstance(
arbitrary.dtype, pd.api.extensions.ExtensionDtype
) and not isinstance(arbitrary, pd.arrays.PandasArray):
raise NotImplementedError(
"cuDF does not yet support `PeriodDtype`"
"Custom pandas ExtensionDtypes are not supported"
)
else:
if cudf.get_option(
"mode.pandas_compatible"
) and _is_pandas_nullable_extension_dtype(arbitrary.dtype):
raise NotImplementedError("not supported")
pyarrow_array = pa.array(arbitrary, from_pandas=nan_as_null)
if arbitrary.dtype == cudf.dtype("object") and cudf.dtype(
pyarrow_array.type.to_pandas_dtype()
) != cudf.dtype(arbitrary.dtype):
elif arbitrary.dtype.kind in "fiubmM":
# numpy dtype like
if isinstance(arbitrary, pd.arrays.PandasArray):
arbitrary = np.array(arbitrary)
arb_dtype = np.dtype(arbitrary.dtype)
if arb_dtype.kind == "f" and arb_dtype.itemsize == 2:
raise TypeError("Unsupported type float16")
elif arb_dtype.kind in "mM":
# not supported by cupy
arbitrary = np.asarray(arbitrary)
else:
arbitrary = cupy.asarray(arbitrary)
data = as_column(
arbitrary, nan_as_null=nan_as_null, dtype=dtype, length=length
)
elif arbitrary.dtype.kind == "O":
if isinstance(arbitrary, pd.arrays.PandasArray):
# infer_dtype does not handle PandasArray
arbitrary = np.array(arbitrary, dtype=object)
inferred_dtype = infer_dtype(arbitrary)
if inferred_dtype in ("mixed-integer", "mixed-integer-float"):
raise MixedTypeError("Cannot create column with mixed types")
elif inferred_dtype not in (
"mixed",
"decimal",
"string",
"empty",
"boolean",
):
raise TypeError(
f"Cannot convert a {inferred_dtype} of object type"
)
elif nan_as_null is False and (
pd.isna(arbitrary).any()
and inferred_dtype not in ("decimal", "empty")
):
# Decimal can hold float("nan")
# All np.nan is not restricted by type
raise MixedTypeError(f"Cannot have NaN with {inferred_dtype}")

pyarrow_array = pa.array(
arbitrary,
from_pandas=True,
)
if isinstance(pyarrow_array.type, pa.Decimal128Type):
pyarrow_type = cudf.Decimal128Dtype.from_arrow(
pyarrow_array.type
)
else:
pyarrow_type = arbitrary.dtype
data = as_column(pyarrow_array, dtype=pyarrow_type)
data = as_column(
pyarrow_array,
dtype=pyarrow_type,
nan_as_null=nan_as_null,
length=length,
)
else:
raise NotImplementedError(
f"{type(arbitrary).__name__} with "
f"{type(arbitrary.dtype).__name__} is not supported."
)
if dtype is not None:
data = data.astype(dtype)

Expand Down Expand Up @@ -2184,11 +2238,13 @@ def as_column(
arbitrary = arbitrary.astype(cudf.dtype("datetime64[s]"))

buffer = as_buffer(arbitrary.view("|u1"))
mask = None
if nan_as_null is None or nan_as_null is True:
data = build_column(buffer, dtype=arbitrary.dtype)
data = _make_copy_replacing_NaT_with_null(data)
mask = data.mask
else:
bool_mask = as_column(~np.isnat(arbitrary))
mask = as_buffer(bools_to_mask(bool_mask))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This seems like it turns NaT into nulls, is that the intention?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah I think we still want to consider NaT values when creating the mask (as null values), but not necessarily cast the value to NA as tested in test_series_np_array_nat_nan_as_null_false. cc @galipremsagar

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yeah, at one point of time we supported both NA and NaT for datetime & timedelta columns. Now we just treat NA as NaT and vice-versa for datetime & timedelta column. With recent pandas accelerator mode work, we decided to just repr out NA as NAT. So yes we need to mark the mask when we have NAT's anywhere.


data = build_column(data=buffer, mask=mask, dtype=arbitrary.dtype)
elif arb_dtype.kind == "m":
Expand All @@ -2199,11 +2255,13 @@ def as_column(
arbitrary = arbitrary.astype(cudf.dtype("timedelta64[s]"))

buffer = as_buffer(arbitrary.view("|u1"))
mask = None
if nan_as_null is None or nan_as_null is True:
data = build_column(buffer, dtype=arbitrary.dtype)
data = _make_copy_replacing_NaT_with_null(data)
mask = data.mask
else:
bool_mask = as_column(~np.isnat(arbitrary))
mask = as_buffer(bools_to_mask(bool_mask))
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Same question here.


data = cudf.core.column.timedelta.TimeDeltaColumn(
data=buffer,
Expand Down Expand Up @@ -2233,7 +2291,7 @@ def as_column(
data = data.astype(dtype)
elif arb_dtype.kind in ("f"):
if arb_dtype == np.dtype("float16"):
arb_dtype = np.dtype("float32")
raise TypeError("Unsupported type float16")
arb_dtype = cudf.dtype(arb_dtype if dtype is None else dtype)
data = as_column(
cupy.asarray(arbitrary, dtype=arb_dtype),
Expand All @@ -2245,105 +2303,12 @@ def as_column(
if delayed_cast:
data = data.astype(cudf.dtype(dtype))

elif isinstance(arbitrary, pd.arrays.PandasArray):
if cudf.get_option(
"mode.pandas_compatible"
) and _is_pandas_nullable_extension_dtype(arbitrary.dtype):
raise NotImplementedError("not supported")
if is_categorical_dtype(arbitrary.dtype):
arb_dtype = arbitrary.dtype
else:
if arbitrary.dtype == pd.StringDtype():
arb_dtype = cudf.dtype("O")
else:
arb_dtype = (
cudf.dtype("float32")
if arbitrary.dtype == "float16"
else cudf.dtype(arbitrary.dtype)
)
if arb_dtype != arbitrary.dtype.numpy_dtype:
arbitrary = arbitrary.astype(arb_dtype)
if (
arbitrary.size != 0
and isinstance(arbitrary[0], pd._libs.interval.Interval)
and arb_dtype.kind in ("O")
):
# changing from pd array to series,possible arrow bug
interval_series = pd.Series(arbitrary)
data = as_column(
pa.Array.from_pandas(interval_series), dtype=arb_dtype
)
elif arb_dtype.kind in ("O", "U"):
pyarrow_array = pa.Array.from_pandas(arbitrary)
if not isinstance(
pyarrow_array,
(
pa.ListArray,
pa.StructArray,
pa.NullArray,
pa.Decimal128Array,
pa.StringArray,
pa.BooleanArray,
),
):
raise MixedTypeError("Cannot create column with mixed types")
data = as_column(pyarrow_array, dtype=arb_dtype)
else:
data = as_column(
pa.array(
arbitrary,
from_pandas=True if nan_as_null is None else nan_as_null,
),
nan_as_null=nan_as_null,
)
if dtype is not None:
data = data.astype(dtype)
elif isinstance(arbitrary, pd.arrays.SparseArray):
raise NotImplementedError(
f"{arbitrary.dtype} is not supported. Convert first to "
f"{arbitrary.dtype.subtype}."
)
elif isinstance(arbitrary, memoryview):
data = as_column(
np.asarray(arbitrary), dtype=dtype, nan_as_null=nan_as_null
)
elif isinstance(arbitrary, cudf.Scalar):
data = ColumnBase.from_scalar(arbitrary, length if length else 1)
elif isinstance(arbitrary, pd.core.arrays.masked.BaseMaskedArray):
if cudf.get_option("mode.pandas_compatible"):
raise NotImplementedError("not supported")
data = as_column(pa.Array.from_pandas(arbitrary), dtype=dtype)
elif (
(
isinstance(arbitrary, pd.DatetimeIndex)
and isinstance(arbitrary.dtype, pd.DatetimeTZDtype)
)
or (
isinstance(arbitrary, pd.IntervalIndex)
and is_datetime64tz_dtype(arbitrary.dtype.subtype)
)
or (
isinstance(arbitrary, pd.CategoricalIndex)
and isinstance(
arbitrary.dtype.categories.dtype, pd.DatetimeTZDtype
)
)
):
raise NotImplementedError(
"cuDF does not yet support timezone-aware datetimes"
)
elif isinstance(
arbitrary, (pd.core.arrays.period.PeriodArray, pd.PeriodIndex)
):
raise NotImplementedError(
f"cuDF does not yet support {type(arbitrary).__name__}"
)
elif (
cudf.get_option("mode.pandas_compatible")
and isinstance(arbitrary, (pd.DatetimeIndex, pd.TimedeltaIndex))
and arbitrary.freq is not None
):
raise NotImplementedError("freq is not implemented yet")
else:
try:
data = as_column(
Expand Down
1 change: 0 additions & 1 deletion python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -3374,7 +3374,6 @@ def describe(
data=data.values(),
index=data.keys(),
dtype=dtype,
nan_as_null=False,
name=self.name,
)

Expand Down
Loading