refactor(python): unify nan_to_null and nan_to_none parameter names, expose to DataFrame init, add test coverage (#6637)
alexander-beedie authored Feb 3, 2023
1 parent a1a5d4a commit 1f9bb06
Showing 6 changed files with 104 additions and 38 deletions.
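In practice, every Python-side constructor entry point now takes the same `nan_to_null` flag. A minimal usage sketch, mirroring the semantics of the tests added below (numpy `NaN` values become null only when the flag is set):

```python
import numpy as np
import polars as pl

# NaN in numpy input is kept as floating-point NaN by default;
# nan_to_null=True converts it to a (missing) null at construction time.
df = pl.DataFrame({"x": np.array([1.0, 2.5, np.nan])}, nan_to_null=True)
assert df.rows() == [(1.0,), (2.5,), (None,)]
```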
13 changes: 7 additions & 6 deletions py-polars/polars/convert.py
@@ -392,7 +392,7 @@ def from_arrow(
 def from_pandas(
     df: pd.DataFrame,
     rechunk: bool = True,
-    nan_to_none: bool = True,
+    nan_to_null: bool = True,
     schema_overrides: SchemaDict | None = None,
 ) -> DataFrame:
     ...
@@ -402,16 +402,17 @@ def from_pandas(
 def from_pandas(
     df: pd.Series | pd.DatetimeIndex,
     rechunk: bool = True,
-    nan_to_none: bool = True,
+    nan_to_null: bool = True,
     schema_overrides: SchemaDict | None = None,
 ) -> Series:
     ...


+@deprecated_alias(nan_to_none="nan_to_null")
 def from_pandas(
     df: pd.DataFrame | pd.Series | pd.DatetimeIndex,
     rechunk: bool = True,
-    nan_to_none: bool = True,
+    nan_to_null: bool = True,
     schema_overrides: SchemaDict | None = None,
 ) -> DataFrame | Series:
     """
@@ -427,7 +428,7 @@ def from_pandas(
         Data represented as a pandas DataFrame, Series, or DatetimeIndex.
     rechunk : bool, default True
         Make sure that all data is in contiguous memory.
-    nan_to_none : bool, default True
+    nan_to_null : bool, default True
         If data contains `NaN` values PyArrow will convert the ``NaN`` to ``None``
     schema_overrides : dict, default None
         Support override of inferred types for one or more columns.
@@ -470,12 +471,12 @@ def from_pandas(
     """
     if isinstance(df, (pd.Series, pd.DatetimeIndex)):
-        return Series._from_pandas("", df, nan_to_none=nan_to_none)
+        return Series._from_pandas("", df, nan_to_null=nan_to_null)
     elif isinstance(df, pd.DataFrame):
         return DataFrame._from_pandas(
             df,
             rechunk=rechunk,
-            nan_to_none=nan_to_none,
+            nan_to_null=nan_to_null,
             schema_overrides=schema_overrides,
         )
     else:
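With the rename, the old keyword still reaches `from_pandas` through `@deprecated_alias`, which remaps it to the new name (and should emit a deprecation warning). A short sketch of the renamed entry point:

```python
import pandas as pd
import polars as pl

pd_df = pd.DataFrame({"a": [1.0, float("nan"), 3.0]})

# New spelling; NaN -> null conversion is on by default for pandas input.
df = pl.from_pandas(pd_df, nan_to_null=True)
assert df["a"].null_count() == 1

# The old spelling, pl.from_pandas(pd_df, nan_to_none=True), still works
# via the alias but is deprecated.
```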
66 changes: 48 additions & 18 deletions py-polars/polars/internals/construction.py
@@ -54,7 +54,13 @@
 from polars.dependencies import pandas as pd
 from polars.dependencies import pyarrow as pa
 from polars.exceptions import ShapeError
-from polars.utils import _is_generator, arrlen, range_to_series, threadpool_size
+from polars.utils import (
+    _is_generator,
+    arrlen,
+    deprecated_alias,
+    range_to_series,
+    threadpool_size,
+)

 if version_info >= (3, 10):

@@ -455,9 +461,10 @@ def sequence_to_pyseries(
         raise error


+@deprecated_alias(nan_to_none="nan_to_null")
 def _pandas_series_to_arrow(
     values: pd.Series | pd.DatetimeIndex,
-    nan_to_none: bool = True,
+    nan_to_null: bool = True,
     min_len: int | None = None,
 ) -> pa.Array:
     """
@@ -467,7 +474,7 @@ def _pandas_series_to_arrow(
     ----------
     values : :class:`pandas.Series` or :class:`pandas.DatetimeIndex`
         Series to convert to arrow
-    nan_to_none : bool, default = True
+    nan_to_null : bool, default = True
         Interpret `NaN` as missing values
     min_len : int, optional
         in case of null values, this length will be used to create a dummy f64 array
@@ -482,12 +489,12 @@ def _pandas_series_to_arrow(
     if dtype == "object":
         first_non_none = _get_first_non_none(values.values)  # type: ignore[arg-type]
         if isinstance(first_non_none, str):
-            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)
+            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_null)
         elif first_non_none is None:
             return pa.nulls(min_len, pa.large_utf8())
-        return pa.array(values, from_pandas=nan_to_none)
+        return pa.array(values, from_pandas=nan_to_null)
     elif dtype:
-        return pa.array(values, from_pandas=nan_to_none)
+        return pa.array(values, from_pandas=nan_to_null)
     else:
         # Pandas Series is actually a Pandas DataFrame when the original dataframe
         # contains duplicated columns and a duplicated column is requested with df["a"].
@@ -497,15 +504,16 @@
     )


+@deprecated_alias(nan_to_none="nan_to_null")
 def pandas_to_pyseries(
-    name: str, values: pd.Series | pd.DatetimeIndex, nan_to_none: bool = True
+    name: str, values: pd.Series | pd.DatetimeIndex, nan_to_null: bool = True
 ) -> PySeries:
     """Construct a PySeries from a pandas Series or DatetimeIndex."""
     # TODO: Change `if not name` to `if name is not None` once name is Optional[str]
     if not name and values.name is not None:
         name = str(values.name)
     return arrow_to_pyseries(
-        name, _pandas_series_to_arrow(values, nan_to_none=nan_to_none)
+        name, _pandas_series_to_arrow(values, nan_to_null=nan_to_null)
     )


@@ -620,8 +628,9 @@ def _expand_dict_scalars(
     data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | pli.Series],
     schema_overrides: SchemaDict | None = None,
     order: Sequence[str] | None = None,
+    nan_to_null: bool = False,
 ) -> dict[str, pli.Series]:
-    """Expand scalar values in dict data (propagate literal as array)."""
+    """Expand any scalar values in dict data (propagate literal as array)."""
     updated_data = {}
     if data:
         dtypes = schema_overrides or {}
@@ -633,7 +642,9 @@
                 updated_data[name] = pli.DataFrame(val).to_struct(name)

             elif arrlen(val) is not None or _is_generator(val):
-                updated_data[name] = pli.Series(name=name, values=val, dtype=dtype)
+                updated_data[name] = pli.Series(
+                    name=name, values=val, dtype=dtype, nan_to_null=nan_to_null
+                )

             elif val is None or isinstance(  # type: ignore[redundant-expr]
                 val, (int, float, str, bool)
@@ -668,6 +679,7 @@ def dict_to_pydf(
     data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | pli.Series],
     schema: SchemaDefinition | None = None,
     schema_overrides: SchemaDict | None = None,
+    nan_to_null: bool = False,
 ) -> PyDataFrame:
     """Construct a PyDataFrame from a dictionary of sequences."""
     if not schema:
@@ -686,11 +698,17 @@
         )
     if not data and schema_overrides:
         data_series = [
-            pli.Series(name, [], schema_overrides.get(name))._s for name in columns
+            pli.Series(
+                name, [], dtype=schema_overrides.get(name), nan_to_null=nan_to_null
+            )._s
+            for name in columns
         ]
     else:
         data_series = [
-            s._s for s in _expand_dict_scalars(data, schema_overrides).values()
+            s._s
+            for s in _expand_dict_scalars(
+                data, schema_overrides, nan_to_null=nan_to_null
+            ).values()
         ]

     data_series = _handle_columns_arg(data_series, columns=columns, from_dict=True)
@@ -921,6 +939,7 @@ def numpy_to_pydf(
     schema: SchemaDefinition | None = None,
     schema_overrides: SchemaDict | None = None,
     orient: Orientation | None = None,
+    nan_to_null: bool = False,
 ) -> PyDataFrame:
     """Construct a PyDataFrame from a numpy ndarray."""
     shape = data.shape
@@ -974,21 +993,31 @@

     elif len(shape) == 1:
         data_series = [
-            pli.Series(column_names[0], data, schema_overrides.get(column_names[0]))._s
+            pli.Series(
+                name=column_names[0],
+                values=data,
+                dtype=schema_overrides.get(column_names[0]),
+                nan_to_null=nan_to_null,
+            )._s
         ]

     else:
         if orient == "row":
             data_series = [
                 pli.Series(
-                    column_names[i], data[:, i], schema_overrides.get(column_names[i])
+                    name=column_names[i],
+                    values=data[:, i],
+                    dtype=schema_overrides.get(column_names[i]),
+                    nan_to_null=nan_to_null,
                 )._s
                 for i in range(n_columns)
             ]
         else:
             data_series = [
                 pli.Series(
-                    column_names[i], data[i], schema_overrides.get(column_names[i])
+                    name=column_names[i],
+                    values=data[i],
+                    dtype=schema_overrides.get(column_names[i]),
+                    nan_to_null=nan_to_null,
                 )._s
                 for i in range(n_columns)
             ]
@@ -1192,18 +1221,19 @@ def to_frame_chunk(
         return (df.rechunk() if n_chunks > 0 else df)._df


+@deprecated_alias(nan_to_none="nan_to_null")
 def pandas_to_pydf(
     data: pd.DataFrame,
     schema: SchemaDefinition | None = None,
     schema_overrides: SchemaDict | None = None,
     rechunk: bool = True,
-    nan_to_none: bool = True,
+    nan_to_null: bool = True,
 ) -> PyDataFrame:
     """Construct a PyDataFrame from a pandas DataFrame."""
     length = data.shape[0]
     arrow_dict = {
         str(col): _pandas_series_to_arrow(
-            data[col], nan_to_none=nan_to_none, min_len=length
+            data[col], nan_to_null=nan_to_null, min_len=length
         )
         for col in data.columns
     }
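Under the hood, the pandas conversion path forwards `nan_to_null` straight into pyarrow's `from_pandas` flag (see `_pandas_series_to_arrow` above), which decides whether NaN sentinels become arrow nulls. A small sketch of that underlying pyarrow behavior:

```python
import numpy as np
import pandas as pd
import pyarrow as pa

s = pd.Series([1.0, np.nan, 3.0])

# from_pandas=True: NaN is interpreted as a missing value -> arrow null.
assert pa.array(s, from_pandas=True).null_count == 1

# from_pandas=False: NaN stays a valid float value, no nulls.
assert pa.array(s, from_pandas=False).null_count == 0
```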
25 changes: 18 additions & 7 deletions py-polars/polars/internals/dataframe/frame.py
@@ -188,6 +188,9 @@ class DataFrame:
     infer_schema_length : int, default None
         Maximum number of rows to read for schema inference; only applies if the input
         data is a sequence or generator of rows; other input is read as-is.
+    nan_to_null : bool, default False
+        If the data comes from one or more numpy arrays, can optionally convert input
+        data np.nan values to null instead. This is a no-op for all other input data.

     Examples
     --------
@@ -324,6 +327,7 @@ def __init__(
         schema_overrides: SchemaDict | None = None,
         orient: Orientation | None = None,
         infer_schema_length: int | None = N_INFER_DEFAULT,
+        nan_to_null: bool = False,
     ):
         if data is None:
             self._df = dict_to_pydf(
@@ -332,7 +336,7 @@

         elif isinstance(data, dict):
             self._df = dict_to_pydf(
-                data, schema=schema, schema_overrides=schema_overrides
+                data,
+                schema=schema,
+                schema_overrides=schema_overrides,
+                nan_to_null=nan_to_null,
             )

         elif isinstance(data, (list, tuple, Sequence)):
@@ -350,7 +357,11 @@

         elif _check_for_numpy(data) and isinstance(data, np.ndarray):
             self._df = numpy_to_pydf(
-                data, schema=schema, schema_overrides=schema_overrides, orient=orient
+                data,
+                schema=schema,
+                schema_overrides=schema_overrides,
+                orient=orient,
+                nan_to_null=nan_to_null,
             )

         elif _check_for_pyarrow(data) and isinstance(data, pa.Table):
@@ -585,14 +596,14 @@ def _from_arrow(
         )

     @classmethod
-    @deprecated_alias(columns="schema")
+    @deprecated_alias(columns="schema", nan_to_none="nan_to_null")
     def _from_pandas(
         cls: type[DF],
         data: pd.DataFrame,
         schema: SchemaDefinition | None = None,
         schema_overrides: SchemaDict | None = None,
         rechunk: bool = True,
-        nan_to_none: bool = True,
+        nan_to_null: bool = True,
     ) -> DF:
         """
         Construct a Polars DataFrame from a pandas DataFrame.
@@ -616,8 +627,8 @@ def _from_pandas(
             any dtypes inferred from the columns param will be overridden.
         rechunk : bool, default True
             Make sure that all data is in contiguous memory.
-        nan_to_none : bool, default True
-            If data contains NaN values PyArrow will convert the NaN to None
+        nan_to_null : bool, default True
+            If the data contains NaN values they will be converted to null/None.

         Returns
         -------
@@ -630,7 +641,7 @@ def _from_pandas(
                 schema=schema,
                 schema_overrides=schema_overrides,
                 rechunk=rechunk,
-                nan_to_none=nan_to_none,
+                nan_to_null=nan_to_null,
             )
         )

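Since `__init__` now forwards the flag into `numpy_to_pydf`, two-dimensional numpy input is covered as well. A sketch, assuming the list-of-names form of `schema` together with `orient="row"` (both already supported by this constructor):

```python
import numpy as np
import polars as pl

arr = np.array([[1.0, np.nan], [np.nan, 4.0]])
# Each row of the array becomes a row of the frame; NaN -> null.
df = pl.DataFrame(arr, schema=["a", "b"], orient="row", nan_to_null=True)
assert df.rows() == [(1.0, None), (None, 4.0)]
```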
8 changes: 5 additions & 3 deletions py-polars/polars/internals/series/series.py
@@ -75,6 +75,7 @@
     _datetime_to_pl_timestamp,
     _is_generator,
     _time_to_pl_time,
+    deprecated_alias,
     is_int_sequence,
     range_to_series,
     range_to_slice,
@@ -142,7 +143,7 @@ class Series:
         Throw error on numeric overflow.
     nan_to_null
         In case a numpy array is used to create this Series, indicate how to deal
-        with np.nan values.
+        with np.nan values. (This parameter is a no-op on non-numpy data).
     dtype_if_empty=dtype_if_empty : DataType, default None
         If no dtype is specified and values contains None or an empty list,
         set the Polars dtype of the Series data. If not specified, Float32 is used.
@@ -302,12 +303,13 @@ def _from_arrow(cls, name: str, values: pa.Array, rechunk: bool = True) -> Serie
         return cls._from_pyseries(arrow_to_pyseries(name, values, rechunk))

     @classmethod
+    @deprecated_alias(nan_to_none="nan_to_null")
     def _from_pandas(
-        cls, name: str, values: pd.Series | pd.DatetimeIndex, nan_to_none: bool = True
+        cls, name: str, values: pd.Series | pd.DatetimeIndex, nan_to_null: bool = True
     ) -> Series:
         """Construct a Series from a pandas Series or DatetimeIndex."""
         return cls._from_pyseries(
-            pandas_to_pyseries(name, values, nan_to_none=nan_to_none)
+            pandas_to_pyseries(name, values, nan_to_null=nan_to_null)
         )

     def _get_ptr(self) -> int:
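For reference, `deprecated_alias` (imported from `polars.utils` in the hunks above) remaps old keyword names to new ones at call time. A hypothetical minimal equivalent — not polars' actual implementation — might look like:

```python
import warnings
from functools import wraps
from typing import Any, Callable


def deprecated_alias(**aliases: str) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
    """Remap deprecated keyword arguments to their new names (sketch only)."""

    def decorate(fn: Callable[..., Any]) -> Callable[..., Any]:
        @wraps(fn)
        def wrapper(*args: Any, **kwargs: Any) -> Any:
            for old, new in aliases.items():
                if old in kwargs:
                    warnings.warn(
                        f"`{old}` is deprecated; use `{new}` instead.",
                        DeprecationWarning,
                        stacklevel=2,
                    )
                    # Forward the value under the new parameter name.
                    kwargs[new] = kwargs.pop(old)
            return fn(*args, **kwargs)

        return wrapper

    return decorate
```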
24 changes: 23 additions & 1 deletion py-polars/tests/unit/test_constructors.py
@@ -12,7 +12,7 @@
 import pytest

 import polars as pl
-from polars.testing import assert_frame_equal
+from polars.testing import assert_frame_equal, assert_series_equal


 def test_init_dict() -> None:
@@ -268,6 +268,20 @@ def test_init_ndarray(monkeypatch: Any) -> None:
     assert df.shape == (2, 1)
     assert df.rows() == [([0, 1, 2, 3, 4],), ([5, 6, 7, 8, 9],)]

+    # numpy arrays containing NaN
+    df0 = pl.DataFrame(
+        data={"x": [1.0, 2.5, float("nan")], "y": [4.0, float("nan"), 6.5]},
+    )
+    df1 = pl.DataFrame(
+        data={"x": np.array([1.0, 2.5, np.nan]), "y": np.array([4.0, np.nan, 6.5])},
+    )
+    df2 = pl.DataFrame(
+        data={"x": np.array([1.0, 2.5, np.nan]), "y": np.array([4.0, np.nan, 6.5])},
+        nan_to_null=True,
+    )
+    assert_frame_equal(df0, df1, nans_compare_equal=True)
+    assert df2.rows() == [(1.0, 4.0), (2.5, None), (None, 6.5)]


 def test_init_arrow() -> None:
     # Handle unnamed column
@@ -340,6 +354,14 @@ def test_init_series() -> None:
     # nested list
     assert pl.Series([[[2, 2]]]).dtype == pl.List(pl.List(pl.Int64))

+    # numpy data containing NaN values
+    s0 = pl.Series("n", [1.0, 2.5, float("nan")])
+    s1 = pl.Series("n", np.array([1.0, 2.5, float("nan")]))
+    s2 = pl.Series("n", np.array([1.0, 2.5, float("nan")]), nan_to_null=True)
+
+    assert_series_equal(s0, s1, nans_compare_equal=True)
+    assert s2.to_list() == [1.0, 2.5, None]


 def test_init_seq_of_seq() -> None:
     # List of lists