diff --git a/py-polars/polars/convert.py b/py-polars/polars/convert.py
index ac204f72546d..e474ed05f6b2 100644
--- a/py-polars/polars/convert.py
+++ b/py-polars/polars/convert.py
@@ -392,7 +392,7 @@ def from_arrow(
 def from_pandas(
     df: pd.DataFrame,
     rechunk: bool = True,
-    nan_to_none: bool = True,
+    nan_to_null: bool = True,
     schema_overrides: SchemaDict | None = None,
 ) -> DataFrame:
     ...
@@ -402,16 +402,17 @@ def from_pandas(
 def from_pandas(
     df: pd.Series | pd.DatetimeIndex,
     rechunk: bool = True,
-    nan_to_none: bool = True,
+    nan_to_null: bool = True,
     schema_overrides: SchemaDict | None = None,
 ) -> Series:
     ...
 
 
+@deprecated_alias(nan_to_none="nan_to_null")
 def from_pandas(
     df: pd.DataFrame | pd.Series | pd.DatetimeIndex,
     rechunk: bool = True,
-    nan_to_none: bool = True,
+    nan_to_null: bool = True,
     schema_overrides: SchemaDict | None = None,
 ) -> DataFrame | Series:
     """
@@ -427,7 +428,7 @@ def from_pandas(
         Data represented as a pandas DataFrame, Series, or DatetimeIndex.
     rechunk : bool, default True
         Make sure that all data is in contiguous memory.
-    nan_to_none : bool, default True
+    nan_to_null : bool, default True
         If data contains `NaN` values PyArrow will convert the ``NaN`` to ``None``
     schema_overrides : dict, default None
         Support override of inferred types for one or more columns.
@@ -470,12 +471,12 @@ def from_pandas(
     """
     if isinstance(df, (pd.Series, pd.DatetimeIndex)):
-        return Series._from_pandas("", df, nan_to_none=nan_to_none)
+        return Series._from_pandas("", df, nan_to_null=nan_to_null)
     elif isinstance(df, pd.DataFrame):
         return DataFrame._from_pandas(
             df,
             rechunk=rechunk,
-            nan_to_none=nan_to_none,
+            nan_to_null=nan_to_null,
             schema_overrides=schema_overrides,
         )
     else:
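Note on backwards compatibility: the `@deprecated_alias(nan_to_none="nan_to_null")` decorator keeps the old keyword working while steering callers to the new name. A minimal usage sketch of the resulting behaviour at the public API (that the alias reports itself as a `DeprecationWarning` is an assumption about `deprecated_alias`, which is not shown in this diff):

    import warnings

    import numpy as np
    import pandas as pd

    import polars as pl

    pd_df = pd.DataFrame({"a": [1.0, np.nan, 3.0]})

    # New spelling: NaN is converted to null by default.
    assert pl.from_pandas(pd_df)["a"].null_count() == 1

    # Old spelling still works, but should warn about the rename.
    with warnings.catch_warnings(record=True) as caught:
        warnings.simplefilter("always")
        pl.from_pandas(pd_df, nan_to_none=True)
    assert any(issubclass(w.category, DeprecationWarning) for w in caught)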
diff --git a/py-polars/polars/internals/construction.py b/py-polars/polars/internals/construction.py
index b60367866d98..6428b7db10c6 100644
--- a/py-polars/polars/internals/construction.py
+++ b/py-polars/polars/internals/construction.py
@@ -54,7 +54,13 @@
 from polars.dependencies import pandas as pd
 from polars.dependencies import pyarrow as pa
 from polars.exceptions import ShapeError
-from polars.utils import _is_generator, arrlen, range_to_series, threadpool_size
+from polars.utils import (
+    _is_generator,
+    arrlen,
+    deprecated_alias,
+    range_to_series,
+    threadpool_size,
+)
 
 
 if version_info >= (3, 10):
@@ -455,9 +461,10 @@ def sequence_to_pyseries(
         raise error
 
 
+@deprecated_alias(nan_to_none="nan_to_null")
 def _pandas_series_to_arrow(
     values: pd.Series | pd.DatetimeIndex,
-    nan_to_none: bool = True,
+    nan_to_null: bool = True,
     min_len: int | None = None,
 ) -> pa.Array:
     """
@@ -467,7 +474,7 @@ def _pandas_series_to_arrow(
     ----------
     values : :class:`pandas.Series` or :class:`pandas.DatetimeIndex`
         Series to convert to arrow
-    nan_to_none : bool, default = True
+    nan_to_null : bool, default = True
         Interpret `NaN` as missing values
     min_len : int, optional
         in case of null values, this length will be used to create a dummy f64 array
@@ -482,12 +489,12 @@ def _pandas_series_to_arrow(
     if dtype == "object":
         first_non_none = _get_first_non_none(values.values)  # type: ignore[arg-type]
         if isinstance(first_non_none, str):
-            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_none)
+            return pa.array(values, pa.large_utf8(), from_pandas=nan_to_null)
         elif first_non_none is None:
             return pa.nulls(min_len, pa.large_utf8())
-        return pa.array(values, from_pandas=nan_to_none)
+        return pa.array(values, from_pandas=nan_to_null)
     elif dtype:
-        return pa.array(values, from_pandas=nan_to_none)
+        return pa.array(values, from_pandas=nan_to_null)
     else:
         # Pandas Series is actually a Pandas DataFrame when the original dataframe
         # contains duplicated columns and a duplicated column is requested with df["a"].
@@ -497,15 +504,16 @@ def _pandas_series_to_arrow(
     )
 
 
+@deprecated_alias(nan_to_none="nan_to_null")
 def pandas_to_pyseries(
-    name: str, values: pd.Series | pd.DatetimeIndex, nan_to_none: bool = True
+    name: str, values: pd.Series | pd.DatetimeIndex, nan_to_null: bool = True
 ) -> PySeries:
     """Construct a PySeries from a pandas Series or DatetimeIndex."""
     # TODO: Change `if not name` to `if name is not None` once name is Optional[str]
     if not name and values.name is not None:
         name = str(values.name)
     return arrow_to_pyseries(
-        name, _pandas_series_to_arrow(values, nan_to_none=nan_to_none)
+        name, _pandas_series_to_arrow(values, nan_to_null=nan_to_null)
     )
@@ -620,8 +628,9 @@ def _expand_dict_scalars(
     data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | pli.Series],
     schema_overrides: SchemaDict | None = None,
     order: Sequence[str] | None = None,
+    nan_to_null: bool = False,
 ) -> dict[str, pli.Series]:
-    """Expand scalar values in dict data (propagate literal as array)."""
+    """Expand any scalar values in dict data (propagate literal as array)."""
     updated_data = {}
     if data:
         dtypes = schema_overrides or {}
@@ -633,7 +642,9 @@ def _expand_dict_scalars(
                     updated_data[name] = pli.DataFrame(val).to_struct(name)
 
                 elif arrlen(val) is not None or _is_generator(val):
-                    updated_data[name] = pli.Series(name=name, values=val, dtype=dtype)
+                    updated_data[name] = pli.Series(
+                        name=name, values=val, dtype=dtype, nan_to_null=nan_to_null
+                    )
 
                 elif val is None or isinstance(  # type: ignore[redundant-expr]
                     val, (int, float, str, bool)
@@ -668,6 +679,7 @@ def dict_to_pydf(
     data: Mapping[str, Sequence[object] | Mapping[str, Sequence[object]] | pli.Series],
     schema: SchemaDefinition | None = None,
     schema_overrides: SchemaDict | None = None,
+    nan_to_null: bool = False,
 ) -> PyDataFrame:
     """Construct a PyDataFrame from a dictionary of sequences."""
     if not schema:
@@ -686,11 +698,17 @@ def dict_to_pydf(
         )
     if not data and schema_overrides:
         data_series = [
-            pli.Series(name, [], schema_overrides.get(name))._s for name in columns
+            pli.Series(
+                name, [], dtype=schema_overrides.get(name), nan_to_null=nan_to_null
+            )._s
+            for name in columns
         ]
     else:
         data_series = [
-            s._s for s in _expand_dict_scalars(data, schema_overrides).values()
+            s._s
+            for s in _expand_dict_scalars(
+                data, schema_overrides, nan_to_null=nan_to_null
+            ).values()
         ]
 
     data_series = _handle_columns_arg(data_series, columns=columns, from_dict=True)
@@ -921,6 +939,7 @@ def numpy_to_pydf(
     schema: SchemaDefinition | None = None,
     schema_overrides: SchemaDict | None = None,
     orient: Orientation | None = None,
+    nan_to_null: bool = False,
 ) -> PyDataFrame:
     """Construct a PyDataFrame from a numpy ndarray."""
     shape = data.shape
@@ -974,21 +993,31 @@ def numpy_to_pydf(
 
     elif len(shape) == 1:
         data_series = [
-            pli.Series(column_names[0], data, schema_overrides.get(column_names[0]))._s
+            pli.Series(
+                name=column_names[0],
+                values=data,
+                dtype=schema_overrides.get(column_names[0]),
+                nan_to_null=nan_to_null,
+            )._s
         ]
-
     else:
         if orient == "row":
             data_series = [
                 pli.Series(
-                    column_names[i], data[:, i], schema_overrides.get(column_names[i])
+                    name=column_names[i],
+                    values=data[:, i],
+                    dtype=schema_overrides.get(column_names[i]),
+                    nan_to_null=nan_to_null,
                 )._s
                 for i in range(n_columns)
             ]
         else:
             data_series = [
                 pli.Series(
-                    column_names[i], data[i], schema_overrides.get(column_names[i])
+                    name=column_names[i],
+                    values=data[i],
+                    dtype=schema_overrides.get(column_names[i]),
+                    nan_to_null=nan_to_null,
                 )._s
                 for i in range(n_columns)
             ]
@@ -1192,18 +1221,19 @@ def to_frame_chunk(
         return (df.rechunk() if n_chunks > 0 else df)._df
 
 
+@deprecated_alias(nan_to_none="nan_to_null")
 def pandas_to_pydf(
     data: pd.DataFrame,
     schema: SchemaDefinition | None = None,
     schema_overrides: SchemaDict | None = None,
     rechunk: bool = True,
-    nan_to_none: bool = True,
+    nan_to_null: bool = True,
 ) -> PyDataFrame:
     """Construct a PyDataFrame from a pandas DataFrame."""
    length = data.shape[0]
     arrow_dict = {
         str(col): _pandas_series_to_arrow(
-            data[col], nan_to_none=nan_to_none, min_len=length
+            data[col], nan_to_null=nan_to_null, min_len=length
         )
         for col in data.columns
     }
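For context on the construction path above: `_pandas_series_to_arrow` forwards the flag as pyarrow's `from_pandas` argument, which is what actually performs the NaN-to-null conversion. A small standalone sketch of that pyarrow behaviour (assuming pyarrow and numpy are installed):

    import numpy as np
    import pyarrow as pa

    vals = np.array([1.0, np.nan, 3.0])

    # from_pandas=True applies pandas semantics: NaN becomes a null value.
    assert pa.array(vals, from_pandas=True).null_count == 1

    # from_pandas=False keeps NaN as an ordinary float payload.
    assert pa.array(vals, from_pandas=False).null_count == 0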
diff --git a/py-polars/polars/internals/dataframe/frame.py b/py-polars/polars/internals/dataframe/frame.py
index 7b77eb9589ec..9e2327a106c2 100644
--- a/py-polars/polars/internals/dataframe/frame.py
+++ b/py-polars/polars/internals/dataframe/frame.py
@@ -188,6 +188,9 @@ class DataFrame:
     infer_schema_length : int, default None
         Maximum number of rows to read for schema inference; only applies if the input
         data is a sequence or generator of rows; other input is read as-is.
+    nan_to_null : bool, default False
+        If the data comes from one or more numpy arrays, can optionally convert input
+        data np.nan values to null instead. This is a no-op for all other input data.
 
     Examples
     --------
@@ -324,6 +327,7 @@ def __init__(
         schema_overrides: SchemaDict | None = None,
         orient: Orientation | None = None,
         infer_schema_length: int | None = N_INFER_DEFAULT,
+        nan_to_null: bool = False,
     ):
         if data is None:
             self._df = dict_to_pydf(
@@ -332,7 +336,10 @@ def __init__(
 
         elif isinstance(data, dict):
             self._df = dict_to_pydf(
-                data, schema=schema, schema_overrides=schema_overrides
+                data,
+                schema=schema,
+                schema_overrides=schema_overrides,
+                nan_to_null=nan_to_null,
             )
 
         elif isinstance(data, (list, tuple, Sequence)):
@@ -350,7 +357,11 @@ def __init__(
 
         elif _check_for_numpy(data) and isinstance(data, np.ndarray):
             self._df = numpy_to_pydf(
-                data, schema=schema, schema_overrides=schema_overrides, orient=orient
+                data,
+                schema=schema,
+                schema_overrides=schema_overrides,
+                orient=orient,
+                nan_to_null=nan_to_null,
             )
 
         elif _check_for_pyarrow(data) and isinstance(data, pa.Table):
@@ -585,14 +596,14 @@ def _from_arrow(
         )
 
     @classmethod
-    @deprecated_alias(columns="schema")
+    @deprecated_alias(columns="schema", nan_to_none="nan_to_null")
     def _from_pandas(
         cls: type[DF],
         data: pd.DataFrame,
         schema: SchemaDefinition | None = None,
         schema_overrides: SchemaDict | None = None,
         rechunk: bool = True,
-        nan_to_none: bool = True,
+        nan_to_null: bool = True,
     ) -> DF:
         """
         Construct a Polars DataFrame from a pandas DataFrame.
@@ -616,8 +627,8 @@ def _from_pandas(
             any dtypes inferred from the columns param will be overridden.
         rechunk : bool, default True
             Make sure that all data is in contiguous memory.
-        nan_to_none : bool, default True
-            If data contains NaN values PyArrow will convert the NaN to None
+        nan_to_null : bool, default True
+            If the data contains NaN values they will be converted to null/None.
 
         Returns
        -------
@@ -630,7 +641,7 @@ def _from_pandas(
                 schema=schema,
                 schema_overrides=schema_overrides,
                 rechunk=rechunk,
-                nan_to_none=nan_to_none,
+                nan_to_null=nan_to_null,
             )
         )
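The net effect of the new constructor flag, mirroring the tests added further down in this diff (a usage sketch, not part of the patch itself):

    import numpy as np

    import polars as pl

    arr = np.array([1.0, np.nan, 3.0])

    # Default (nan_to_null=False): NaN from numpy input stays a float NaN.
    assert pl.DataFrame({"x": arr})["x"].null_count() == 0

    # With nan_to_null=True the NaN is converted to null at construction time.
    assert pl.DataFrame({"x": arr}, nan_to_null=True)["x"].null_count() == 1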
diff --git a/py-polars/polars/internals/series/series.py b/py-polars/polars/internals/series/series.py
index f22a057bedc2..b0c6d80ac14e 100644
--- a/py-polars/polars/internals/series/series.py
+++ b/py-polars/polars/internals/series/series.py
@@ -75,6 +75,7 @@
     _datetime_to_pl_timestamp,
     _is_generator,
     _time_to_pl_time,
+    deprecated_alias,
     is_int_sequence,
     range_to_series,
     range_to_slice,
@@ -142,7 +143,7 @@ class Series:
         Throw error on numeric overflow.
     nan_to_null
         In case a numpy array is used to create this Series, indicate how to deal
-        with np.nan values.
+        with np.nan values. (This parameter is a no-op on non-numpy data).
     dtype_if_empty=dtype_if_empty : DataType, default None
         If no dtype is specified and values contains None or an empty list, set the
         Polars dtype of the Series data. If not specified, Float32 is used.
@@ -302,12 +303,13 @@ def _from_arrow(cls, name: str, values: pa.Array, rechunk: bool = True) -> Serie
         return cls._from_pyseries(arrow_to_pyseries(name, values, rechunk))
 
     @classmethod
+    @deprecated_alias(nan_to_none="nan_to_null")
     def _from_pandas(
-        cls, name: str, values: pd.Series | pd.DatetimeIndex, nan_to_none: bool = True
+        cls, name: str, values: pd.Series | pd.DatetimeIndex, nan_to_null: bool = True
     ) -> Series:
         """Construct a Series from a pandas Series or DatetimeIndex."""
         return cls._from_pyseries(
-            pandas_to_pyseries(name, values, nan_to_none=nan_to_none)
+            pandas_to_pyseries(name, values, nan_to_null=nan_to_null)
         )
 
     def _get_ptr(self) -> int:
diff --git a/py-polars/tests/unit/test_constructors.py b/py-polars/tests/unit/test_constructors.py
index 94fe9a2ab10d..8f990c6555b8 100644
--- a/py-polars/tests/unit/test_constructors.py
+++ b/py-polars/tests/unit/test_constructors.py
@@ -12,7 +12,7 @@
 import pytest
 
 import polars as pl
-from polars.testing import assert_frame_equal
+from polars.testing import assert_frame_equal, assert_series_equal
 
 
 def test_init_dict() -> None:
@@ -268,6 +268,20 @@ def test_init_ndarray(monkeypatch: Any) -> None:
     assert df.shape == (2, 1)
     assert df.rows() == [([0, 1, 2, 3, 4],), ([5, 6, 7, 8, 9],)]
 
+    # numpy arrays containing NaN
+    df0 = pl.DataFrame(
+        data={"x": [1.0, 2.5, float("nan")], "y": [4.0, float("nan"), 6.5]},
+    )
+    df1 = pl.DataFrame(
+        data={"x": np.array([1.0, 2.5, np.nan]), "y": np.array([4.0, np.nan, 6.5])},
+    )
+    df2 = pl.DataFrame(
+        data={"x": np.array([1.0, 2.5, np.nan]), "y": np.array([4.0, np.nan, 6.5])},
+        nan_to_null=True,
+    )
+    assert_frame_equal(df0, df1, nans_compare_equal=True)
+    assert df2.rows() == [(1.0, 4.0), (2.5, None), (None, 6.5)]
+
 
 def test_init_arrow() -> None:
     # Handle unnamed column
@@ -340,6 +354,14 @@ def test_init_series() -> None:
     # nested list
     assert pl.Series([[[2, 2]]]).dtype == pl.List(pl.List(pl.Int64))
 
+    # numpy data containing NaN values
+    s0 = pl.Series("n", [1.0, 2.5, float("nan")])
+    s1 = pl.Series("n", np.array([1.0, 2.5, float("nan")]))
+    s2 = pl.Series("n", np.array([1.0, 2.5, float("nan")]), nan_to_null=True)
+
+    assert_series_equal(s0, s1, nans_compare_equal=True)
+    assert s2.to_list() == [1.0, 2.5, None]
+
 
 def test_init_seq_of_seq() -> None:
     # List of lists
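The same flag at the Series level, as exercised by the new `test_init_series` assertions above (a usage sketch):

    import numpy as np

    import polars as pl

    # NaN in numpy input becomes null when nan_to_null=True is passed.
    s = pl.Series("n", np.array([1.0, 2.5, float("nan")]), nan_to_null=True)
    assert s.to_list() == [1.0, 2.5, None]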
diff --git a/py-polars/tests/unit/test_interop.py b/py-polars/tests/unit/test_interop.py
index 736dab33d2a3..d5dbe8289fd1 100644
--- a/py-polars/tests/unit/test_interop.py
+++ b/py-polars/tests/unit/test_interop.py
@@ -127,7 +127,7 @@ def test_from_pandas() -> None:
     assert out.schema[col] == dtype
 
 
-def test_from_pandas_nan_to_none() -> None:
+def test_from_pandas_nan_to_null() -> None:
     df = pd.DataFrame(
         {
             "bools_nulls": [None, True, False],
@@ -138,13 +138,13 @@ def test_from_pandas_nan_to_none() -> None:
         }
     )
     out_true = pl.from_pandas(df)
-    out_false = pl.from_pandas(df, nan_to_none=False)
+    out_false = pl.from_pandas(df, nan_to_null=False)
     assert all(val is None for val in out_true["nulls"])
     assert all(np.isnan(val) for val in out_false["nulls"][1:])
 
     df = pd.Series([2, np.nan, None], name="pd")  # type: ignore[assignment]
     out_true = pl.from_pandas(df)
-    out_false = pl.from_pandas(df, nan_to_none=False)
+    out_false = pl.from_pandas(df, nan_to_null=False)
     assert [val is None for val in out_true]
     assert [np.isnan(val) for val in out_false[1:]]
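For reference, the patch leans on polars' existing `deprecated_alias` helper from `polars.utils`, which is imported in several files above but whose definition is not part of this diff. A simplified sketch of the idea, not the actual implementation:

    import warnings
    from functools import wraps
    from typing import Any, Callable

    def deprecated_alias(**aliases: str) -> Callable[[Callable[..., Any]], Callable[..., Any]]:
        """Remap deprecated keyword arguments to their new names, with a warning."""

        def deco(fn: Callable[..., Any]) -> Callable[..., Any]:
            @wraps(fn)
            def wrapper(*args: Any, **kwargs: Any) -> Any:
                # Rewrite any old-style keyword to its new name before calling `fn`.
                for old, new in aliases.items():
                    if old in kwargs:
                        warnings.warn(
                            f"`{old}` is deprecated as an argument to `{fn.__name__}`; "
                            f"use `{new}` instead.",
                            DeprecationWarning,
                            stacklevel=2,
                        )
                        kwargs[new] = kwargs.pop(old)
                return fn(*args, **kwargs)

            return wrapper

        return deco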