Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix type of empty Index and raise warning in Series constructor #14116

Merged
merged 5 commits into from
Sep 20, 2023
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
21 changes: 14 additions & 7 deletions python/cudf/cudf/core/algorithms.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,12 +4,13 @@
import cupy as cp
import numpy as np

from cudf.core.column import as_column
from cudf.core.copy_types import BooleanMask
from cudf.core.index import Index, RangeIndex
from cudf.core.index import RangeIndex, as_index
from cudf.core.indexed_frame import IndexedFrame
from cudf.core.scalar import Scalar
from cudf.core.series import Series
from cudf.options import get_option
from cudf.utils.dtypes import can_convert_to_column


def factorize(
Expand Down Expand Up @@ -95,7 +96,13 @@ def factorize(

return_cupy_array = isinstance(values, cp.ndarray)

values = Series(values)
if not can_convert_to_column(values):
raise TypeError(
"'values' can only be a Series, Index, or CuPy array, "
f"got {type(values)}"
)

values = as_column(values)

if na_sentinel is None:
na_sentinel = (
Expand Down Expand Up @@ -128,22 +135,22 @@ def factorize(
warnings.warn("size_hint is not applicable for cudf.factorize")

if use_na_sentinel is None or use_na_sentinel:
cats = values._column.dropna()
cats = values.dropna()
else:
cats = values._column
cats = values

cats = cats.unique().astype(values.dtype)

if sort:
cats = cats.sort_values()

labels = values._column._label_encoding(
labels = values._label_encoding(
cats=cats,
na_sentinel=Scalar(na_sentinel),
dtype="int64" if get_option("mode.pandas_compatible") else None,
).values

return labels, cats.values if return_cupy_array else Index(cats)
return labels, cats.values if return_cupy_array else as_index(cats)


def _linear_interpolation(column, index=None):
Expand Down
2 changes: 1 addition & 1 deletion python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -5606,7 +5606,7 @@ def quantile(
result.name = q
return result

result.index = list(map(float, qs))
result.index = cudf.Index(list(map(float, qs)), dtype="float64")
return result

@_cudf_nvtx_annotate
Expand Down
12 changes: 11 additions & 1 deletion python/cudf/cudf/core/index.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,7 @@
List,
MutableMapping,
Optional,
Sequence,
Tuple,
Type,
Union,
Expand Down Expand Up @@ -3467,14 +3468,23 @@ def __new__(
"tupleize_cols != True is not yet supported"
)

return as_index(
res = as_index(
data,
copy=copy,
dtype=dtype,
name=name,
nan_as_null=nan_as_null,
**kwargs,
)
if (
isinstance(data, Sequence)
and not isinstance(data, range)
and len(data) == 0
and dtype is None
and getattr(data, "dtype", None) is None
):
return res.astype("str")
return res

@classmethod
@_cudf_nvtx_annotate
Expand Down
32 changes: 29 additions & 3 deletions python/cudf/cudf/core/series.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,16 @@
import warnings
from collections import abc
from shutil import get_terminal_size
from typing import Any, Dict, MutableMapping, Optional, Set, Tuple, Union
from typing import (
Any,
Dict,
MutableMapping,
Optional,
Sequence,
Set,
Tuple,
Union,
)

import cupy
import numpy as np
Expand Down Expand Up @@ -500,6 +509,18 @@ def __init__(
copy=False,
nan_as_null=True,
):
if (
isinstance(data, Sequence)
and len(data) == 0
and dtype is None
and getattr(data, "dtype", None) is None
):
warnings.warn(
"The default dtype for empty Series will be 'object' instead "
"of 'float64' in a future version. Specify a dtype explicitly "
"to silence this warning.",
FutureWarning,
)
if isinstance(data, pd.Series):
if name is None:
name = data.name
Expand Down Expand Up @@ -656,7 +677,10 @@ def from_pandas(cls, s, nan_as_null=None):
3 NaN
dtype: float64
"""
return cls(s, nan_as_null=nan_as_null)
with warnings.catch_warnings():
warnings.simplefilter("ignore")
result = cls(s, nan_as_null=nan_as_null)
return result

@property # type: ignore
@_cudf_nvtx_annotate
Expand Down Expand Up @@ -2642,7 +2666,9 @@ def mode(self, dropna=True):
if len(val_counts) > 0:
val_counts = val_counts[val_counts == val_counts.iloc[0]]

return Series(val_counts.index.sort_values(), name=self.name)
return Series._from_data(
{self.name: val_counts.index.sort_values()}, name=self.name
)

@_cudf_nvtx_annotate
def round(self, decimals=0, how="half_even"):
Expand Down
21 changes: 19 additions & 2 deletions python/cudf/cudf/testing/_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -397,15 +397,32 @@ def assert_column_memory_ne(
raise AssertionError("lhs and rhs holds the same memory.")


def _create_pandas_series(data=None, index=None, dtype=None, *args, **kwargs):
# Wrapper around pd.Series using a float64 default dtype for empty data.
def _create_pandas_series_float64_default(
data=None, index=None, dtype=None, *args, **kwargs
):
# Wrapper around pd.Series using a float64
# default dtype for empty data to silence warnings.
# TODO: Remove this in pandas-2.0 upgrade
if dtype is None and (
data is None or (not is_scalar(data) and len(data) == 0)
):
dtype = "float64"
return pd.Series(data=data, index=index, dtype=dtype, *args, **kwargs)


def _create_cudf_series_float64_default(
    data=None, index=None, dtype=None, *args, **kwargs
):
    """Build a ``cudf.Series``, defaulting empty input to ``float64``.

    When ``dtype`` is not given and ``data`` is ``None`` or a zero-length
    non-scalar, ``dtype`` is set to ``"float64"`` before delegating to
    ``cudf.Series`` (done to silence warnings about the default dtype of
    empty data). Every other argument is forwarded unchanged.

    Parameters
    ----------
    data, index, dtype
        Forwarded to ``cudf.Series`` as its first three positional arguments.
    *args, **kwargs
        Any further positional / keyword arguments for ``cudf.Series``.

    Returns
    -------
    cudf.Series
    """
    # TODO: Remove this in pandas-2.0 upgrade
    if dtype is None and (
        data is None or (not is_scalar(data) and len(data) == 0)
    ):
        dtype = "float64"
    # Forward the named parameters positionally: passing them as keywords
    # ahead of ``*args`` makes any extra positional argument bind to ``data``
    # first and fail with "got multiple values for argument 'data'".
    # NOTE(review): assumes cudf.Series takes (data, index, dtype, ...) in
    # that order, mirroring pd.Series - confirm against the constructor.
    return cudf.Series(data, index, dtype, *args, **kwargs)


parametrize_numeric_dtypes_pairwise = pytest.mark.parametrize(
"left_dtype,right_dtype",
list(itertools.combinations_with_replacement(NUMERIC_TYPES, 2)),
Expand Down
19 changes: 10 additions & 9 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,7 @@
ALL_TYPES,
DATETIME_TYPES,
NUMERIC_TYPES,
_create_cudf_series_float64_default,
assert_eq,
assert_exceptions_equal,
assert_neq,
Expand Down Expand Up @@ -2000,8 +2001,8 @@ def test_series_shape():


def test_series_shape_empty():
ps = pd.Series(dtype="float64")
cs = cudf.Series([])
ps = pd.Series([], dtype="float64")
cs = cudf.Series([], dtype="float64")

assert ps.shape == cs.shape

Expand Down Expand Up @@ -2840,7 +2841,7 @@ def test_series_all_null(num_elements, null_type):
@pytest.mark.parametrize("num_elements", [0, 2, 10, 100])
def test_series_all_valid_nan(num_elements):
data = [np.nan] * num_elements
sr = cudf.Series(data, nan_as_null=False)
sr = _create_cudf_series_float64_default(data, nan_as_null=False)
np.testing.assert_equal(sr.null_count, 0)


Expand Down Expand Up @@ -4073,28 +4074,28 @@ def test_empty_dataframe_describe():


def test_as_column_types():
col = column.as_column(cudf.Series([]))
col = column.as_column(cudf.Series([], dtype="float64"))
assert_eq(col.dtype, np.dtype("float64"))
gds = cudf.Series(col)
pds = pd.Series(pd.Series([], dtype="float64"))

assert_eq(pds, gds)

col = column.as_column(cudf.Series([]), dtype="float32")
col = column.as_column(cudf.Series([], dtype="float64"), dtype="float32")
assert_eq(col.dtype, np.dtype("float32"))
gds = cudf.Series(col)
pds = pd.Series(pd.Series([], dtype="float32"))

assert_eq(pds, gds)

col = column.as_column(cudf.Series([]), dtype="str")
col = column.as_column(cudf.Series([], dtype="float64"), dtype="str")
assert_eq(col.dtype, np.dtype("object"))
gds = cudf.Series(col)
pds = pd.Series(pd.Series([], dtype="str"))

assert_eq(pds, gds)

col = column.as_column(cudf.Series([]), dtype="object")
col = column.as_column(cudf.Series([], dtype="float64"), dtype="object")
assert_eq(col.dtype, np.dtype("object"))
gds = cudf.Series(col)
pds = pd.Series(pd.Series([], dtype="object"))
Expand Down Expand Up @@ -4469,7 +4470,7 @@ def test_create_dataframe_column():
)
def test_series_values_host_property(data):
pds = pd.Series(data=data, dtype=None if data else float)
gds = cudf.Series(data)
gds = _create_cudf_series_float64_default(data)

np.testing.assert_array_equal(pds.values, gds.values_host)

Expand All @@ -4492,7 +4493,7 @@ def test_series_values_host_property(data):
)
def test_series_values_property(data):
pds = pd.Series(data=data, dtype=None if data else float)
gds = cudf.Series(data)
gds = _create_cudf_series_float64_default(data)
gds_vals = gds.values
assert isinstance(gds_vals, cupy.ndarray)
np.testing.assert_array_equal(gds_vals.get(), pds.values)
Expand Down
9 changes: 6 additions & 3 deletions python/cudf/cudf/tests/test_dropna.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,14 @@
# Copyright (c) 2020-2022, NVIDIA CORPORATION.
# Copyright (c) 2020-2023, NVIDIA CORPORATION.

import numpy as np
import pandas as pd
import pytest

import cudf
from cudf.testing._utils import _create_pandas_series, assert_eq
from cudf.testing._utils import (
_create_pandas_series_float64_default,
assert_eq,
)


@pytest.mark.parametrize(
Expand All @@ -22,7 +25,7 @@
@pytest.mark.parametrize("inplace", [True, False])
def test_dropna_series(data, nulls, inplace):

psr = _create_pandas_series(data)
psr = _create_pandas_series_float64_default(data)

if len(data) > 0:
if nulls == "one":
Expand Down
4 changes: 2 additions & 2 deletions python/cudf/cudf/tests/test_duplicates.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@
import cudf
from cudf import concat
from cudf.testing._utils import (
_create_pandas_series,
_create_pandas_series_float64_default,
assert_eq,
assert_exceptions_equal,
)
Expand Down Expand Up @@ -62,7 +62,7 @@ def test_duplicated_with_misspelled_column_name(subset):
],
)
def test_drop_duplicates_series(data, keep):
pds = _create_pandas_series(data)
pds = _create_pandas_series_float64_default(data)
gds = cudf.from_pandas(pds)

assert_df(pds.drop_duplicates(keep=keep), gds.drop_duplicates(keep=keep))
Expand Down
16 changes: 12 additions & 4 deletions python/cudf/cudf/tests/test_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,8 @@
SIGNED_INTEGER_TYPES,
SIGNED_TYPES,
UNSIGNED_TYPES,
_create_pandas_series,
_create_cudf_series_float64_default,
_create_pandas_series_float64_default,
assert_column_memory_eq,
assert_column_memory_ne,
assert_eq,
Expand Down Expand Up @@ -1006,8 +1007,8 @@ def test_index_equal_misc(data, other):
actual = gd_data.equals(np.array(gd_other))
assert_eq(expected, actual)

expected = pd_data.equals(_create_pandas_series(pd_other))
actual = gd_data.equals(cudf.Series(gd_other))
expected = pd_data.equals(_create_pandas_series_float64_default(pd_other))
actual = gd_data.equals(_create_cudf_series_float64_default(gd_other))
assert_eq(expected, actual)

expected = pd_data.astype("category").equals(pd_other)
Expand Down Expand Up @@ -2275,7 +2276,7 @@ def test_index_nan_as_null(data, nan_idx, NA_idx, nan_as_null):
],
)
def test_isin_index(data, values):
psr = _create_pandas_series(data)
psr = _create_pandas_series_float64_default(data)
gsr = cudf.Series.from_pandas(psr)

got = gsr.index.isin(values)
Expand Down Expand Up @@ -2780,6 +2781,13 @@ def test_index_empty_from_pandas(request, dtype):
assert_eq(pidx, gidx)


def test_empty_index_init():
    """Check that ``cudf.Index([])`` compares equal to ``pd.Index([])``."""
    expected = pd.Index([])
    got = cudf.Index([])

    assert_eq(expected, got)


@pytest.mark.parametrize(
"data", [[1, 2, 3], ["ab", "cd", "e", None], range(0, 10)]
)
Expand Down
9 changes: 6 additions & 3 deletions python/cudf/cudf/tests/test_rolling.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,10 @@

import cudf
from cudf.core._compat import PANDAS_GE_150, PANDAS_LT_140
from cudf.testing._utils import _create_pandas_series, assert_eq
from cudf.testing._utils import (
_create_pandas_series_float64_default,
assert_eq,
)
from cudf.testing.dataset_generator import rand_dataframe


Expand Down Expand Up @@ -55,7 +58,7 @@ def test_rolling_series_basic(data, index, agg, nulls, center):
elif nulls == "all":
data = [np.nan] * len(data)

psr = _create_pandas_series(data, index=index)
psr = _create_pandas_series_float64_default(data, index=index)
gsr = cudf.Series(psr)
for window_size in range(1, len(data) + 1):
for min_periods in range(1, window_size + 1):
Expand Down Expand Up @@ -313,7 +316,7 @@ def test_rolling_getitem_window():
@pytest.mark.parametrize("center", [True, False])
def test_rollling_series_numba_udf_basic(data, index, center):

psr = _create_pandas_series(data, index=index)
psr = _create_pandas_series_float64_default(data, index=index)
gsr = cudf.from_pandas(psr)

def some_func(A):
Expand Down
Loading