feat(python): Add option to use PyArrow-backed extension arrays when converting to pandas.

Add "use_pyarrow_extension_array" argument to to pl.Series.to_pandas() and
pl.DataFrame.to_pandas() as from Pandas 1.5.0, pandas Series and pandas
DataFrame columns can be backed by PyArrow arrays. This allow zero copy
operations and preservation of null values in Pandas dataframes.

For big dataframe this can make the conversion to a pandas DataFrame
almost for free, both in conversion time and memory usage:

    %time df_pd = df.to_pandas()
    CPU times: user 5.18 s, sys: 817 ms, total: 6 s
    Wall time: 5.12 s

    %time df_pd_pa = df.to_pandas(use_pyarrow_extension_array=True)
    CPU times: user 1.63 ms, sys: 71 µs, total: 1.7 ms
    Wall time: 1.57 ms
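
For reference, a minimal sketch of how such a comparison could be
reproduced (the dataframe behind the timings above is not shown in this
commit message; its size and columns here are assumptions):

    import numpy as np
    import polars as pl

    # ~10 million rows of floats; large enough that the copy cost dominates.
    df = pl.DataFrame(
        {
            "x": np.random.rand(10_000_000),
            "y": np.random.rand(10_000_000),
        }
    )

    df_pd = df.to_pandas()  # materializes NumPy-backed columns (full copy)
    df_pd_pa = df.to_pandas(use_pyarrow_extension_array=True)  # wraps the Arrow buffers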

Preservation of null values in pandas Series:

    >>> s1 = pl.Series("a", [1, 2, 3])
    >>> s1.to_pandas()
    0    1
    1    2
    2    3
    Name: a, dtype: int64
    >>> s1.to_pandas(use_pyarrow_extension_array=True)
    0    1
    1    2
    2    3
    Name: a, dtype: int64[pyarrow]
    >>> s2 = pl.Series("b", [1, 2, None, 4])
    >>> s2.to_pandas()
    0    1.0
    1    2.0
    2    NaN
    3    4.0
    Name: b, dtype: float64
    >>> s2.to_pandas(use_pyarrow_extension_array=True)
    0       1
    1       2
    2    <NA>
    3       4
    Name: b, dtype: int64[pyarrow]
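
Because the Arrow-backed path keeps missing values as pd.NA instead of
casting the column to float64 with NaN, nulls stay distinguishable from
real floats. A small follow-on check (an illustration, not part of the
original commit message):

    >>> s2_pd = s2.to_pandas(use_pyarrow_extension_array=True)
    >>> s2_pd.isna().tolist()
    [False, False, True, False]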
ghuls committed Feb 9, 2023
1 parent b160f53 commit c539d1e
Showing 4 changed files with 191 additions and 23 deletions.
81 changes: 69 additions & 12 deletions py-polars/polars/internals/dataframe/frame.py
@@ -1937,21 +1937,24 @@ def to_numpy(self) -> np.ndarray[Any, Any]:
        else:
            return out

-    def to_pandas(
-        self, *args: Any, date_as_object: bool = False, **kwargs: Any
+    def to_pandas(  # noqa: D417
+        self, *args: Any, use_pyarrow_extension_array: bool = False, **kwargs: Any
    ) -> pd.DataFrame:
        """
        Cast to a pandas DataFrame.

        This requires that :mod:`pandas` and :mod:`pyarrow` are installed.

-        This operation clones data.
+        This operation clones data, unless `use_pyarrow_extension_array=True`.

        Parameters
        ----------
        args
            Arguments will be sent to :meth:`pyarrow.Table.to_pandas`.
-        date_as_object
-            Cast dates to objects. If ``False``, convert to ``datetime64[ns]`` dtype.
+        use_pyarrow_extension_array
+            Use PyArrow-backed extension arrays instead of NumPy arrays for each
+            column of the pandas DataFrame. This allows zero-copy operations and
+            preservation of null values.
+            Further operations on this pandas DataFrame might trigger conversion
+            to NumPy arrays if that operation is not supported by pyarrow compute
+            functions.
        kwargs
            Arguments will be sent to :meth:`pyarrow.Table.to_pandas`.
@@ -1962,21 +1965,75 @@ def to_pandas(
        Examples
        --------
        >>> import pandas
-        >>> df = pl.DataFrame(
+        >>> df1 = pl.DataFrame(
        ...     {
        ...         "foo": [1, 2, 3],
        ...         "bar": [6, 7, 8],
        ...         "ham": ["a", "b", "c"],
        ...     }
        ... )
-        >>> pandas_df = df.to_pandas()
-        >>> type(pandas_df)
+        >>> pandas_df1 = df1.to_pandas()
+        >>> type(pandas_df1)
        <class 'pandas.core.frame.DataFrame'>
+        >>> pandas_df1.dtypes
+        foo     int64
+        bar     int64
+        ham    object
+        dtype: object
+        >>> df2 = pl.DataFrame(
+        ...     {
+        ...         "foo": [1, 2, None],
+        ...         "bar": [6, None, 8],
+        ...         "ham": [None, "b", "c"],
+        ...     }
+        ... )
+        >>> pandas_df2 = df2.to_pandas()
+        >>> pandas_df2
+           foo  bar   ham
+        0  1.0  6.0  None
+        1  2.0  NaN     b
+        2  NaN  8.0     c
+        >>> pandas_df2.dtypes
+        foo    float64
+        bar    float64
+        ham     object
+        dtype: object
+        >>> pandas_df2_pa = df2.to_pandas(use_pyarrow_extension_array=True)
+        >>> pandas_df2_pa
+            foo   bar   ham
+        0     1     6  <NA>
+        1     2  <NA>     b
+        2  <NA>     8     c
+        >>> pandas_df2_pa.dtypes
+        foo           int64[pyarrow]
+        bar           int64[pyarrow]
+        ham    large_string[pyarrow]
+        dtype: object
+        """
+        if use_pyarrow_extension_array:
+            pandas_version_major, pandas_version_minor = (
+                int(x) for x in pd.__version__.split(".")[0:2]
+            )
+            if pandas_version_major == 0 or (
+                pandas_version_major == 1 and pandas_version_minor < 5
+            ):
+                raise ModuleNotFoundError(
+                    f'"use_pyarrow_extension_array=True" requires Pandas 1.5.x or higher, found Pandas {pd.__version__}.'
+                )
+
-        """
        record_batches = self._df.to_pandas()
        tbl = pa.Table.from_batches(record_batches)
-        return tbl.to_pandas(*args, date_as_object=date_as_object, **kwargs)
+        return (
+            tbl.to_pandas(
+                self_destruct=True,
+                split_blocks=True,
+                types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype),
+                **kwargs,
+            )
+            if use_pyarrow_extension_array
+            else tbl.to_pandas(**kwargs)
+        )

    def to_series(self, index: int = 0) -> pli.Series:
        """
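
The mechanism doing the work in this diff is pyarrow's `types_mapper` hook on
:meth:`pyarrow.Table.to_pandas`: when the callable returns a pandas extension
dtype for a given Arrow type, the column is wrapped in place rather than
copied into NumPy. A standalone sketch of that behavior (illustrative, not
taken from the diff):

    import pandas as pd
    import pyarrow as pa

    tbl = pa.table({"a": [1, None, 3]})

    # Default conversion goes through NumPy, so the null forces a float64 cast.
    print(tbl.to_pandas().dtypes)  # a    float64

    # With a types_mapper each column keeps its Arrow buffer and dtype.
    print(tbl.to_pandas(types_mapper=pd.ArrowDtype).dtypes)  # a    int64[pyarrow]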
66 changes: 61 additions & 5 deletions py-polars/polars/internals/series/series.py
@@ -2822,21 +2822,77 @@ def to_arrow(self) -> pa.Array:
"""
return self._s.to_arrow()

def to_pandas(self) -> pd.Series:
def to_pandas( # noqa: D417
self, *args: Any, use_pyarrow_extension_array: bool = False, **kwargs: Any
) -> pd.Series:
"""
Convert this Series to a pandas Series.
This requires that :mod:`pandas` and :mod:`pyarrow` are installed.
This operation clones data, unless `use_pyarrow_extension_array=True`.
Parameters
----------
use_pyarrow_extension_array
Further operations on this Pandas series, might trigger conversion to numpy.
Use PyArrow backed-extension array instead of numpy array for pandas
Series. This allows zero copy operations and preservation of nulls
values.
Further operations on this pandas Series, might trigger conversion
to NumPy arrays if that operation is not supported by pyarrow compute
functions.
kwargs
Arguments will be sent to :meth:`pyarrow.Table.to_pandas`.
Examples
--------
>>> s = pl.Series("a", [1, 2, 3])
>>> s.to_pandas()
>>> s1 = pl.Series("a", [1, 2, 3])
>>> s1.to_pandas()
0 1
1 2
2 3
Name: a, dtype: int64
>>> s1.to_pandas(use_pyarrow_extension_array=True)
0 1
1 2
2 3
Name: a, dtype: int64[pyarrow]
>>> s2 = pl.Series("b", [1, 2, None, 4])
>>> s2.to_pandas()
0 1.0
1 2.0
2 NaN
3 4.0
Name: b, dtype: float64
>>> s2.to_pandas(use_pyarrow_extension_array=True)
0 1
1 2
2 <NA>
3 4
Name: b, dtype: int64[pyarrow]
"""
if use_pyarrow_extension_array:
pandas_version_major, pandas_version_minor = (
int(x) for x in pd.__version__.split(".")[0:2]
)
if pandas_version_major == 0 or (
pandas_version_major == 1 and pandas_version_minor < 5
):
raise ModuleNotFoundError(
f'"use_pyarrow_extension_array=True" requires Pandas 1.5.x or higher, found Pandas {pd.__version__}.'
)

"""
pd_series = self.to_arrow().to_pandas()
pd_series = (
self.to_arrow().to_pandas(
self_destruct=True,
split_blocks=True,
types_mapper=lambda pa_dtype: pd.ArrowDtype(pa_dtype),
**kwargs,
)
if use_pyarrow_extension_array
else self.to_arrow().to_pandas(**kwargs)
)
pd_series.name = self.name
return pd_series

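A note on the version gate above: splitting `pd.__version__` by hand works for
ordinary release strings such as "1.5.3", while the `packaging` library handles
arbitrary version formats. A more tolerant variant (an alternative sketch, not
what this commit does) could look like:

    import pandas as pd
    from packaging.version import Version

    if Version(pd.__version__) < Version("1.5.0"):
        raise ModuleNotFoundError(
            '"use_pyarrow_extension_array=True" requires Pandas 1.5.0 or higher, '
            f"found Pandas {pd.__version__}."
        )
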
49 changes: 47 additions & 2 deletions py-polars/tests/unit/test_interop.py
@@ -446,8 +446,53 @@ def test_no_rechunk() -> None:
def test_cat_to_pandas() -> None:
    df = pl.DataFrame({"a": ["best", "test"]})
    df = df.with_columns(pl.all().cast(pl.Categorical))
-    out = df.to_pandas()
-    assert "category" in str(out["a"].dtype)
+    pd_out = df.to_pandas()
+    assert "category" in str(pd_out["a"].dtype)
+
+    try:
+        pd_pa_out = df.to_pandas(use_pyarrow_extension_array=True)
+        assert pd_pa_out["a"].dtype.type == pa.DictionaryType
+    except ModuleNotFoundError:
+        # Skip test if Pandas 1.5.x is not installed.
+        pass
+
+
+def test_to_pandas() -> None:
+    df = pl.DataFrame(
+        {"a": [1, 2, 3], "b": [6, None, 8], "c": ["a", "b", "c"], "d": [None, "e", "f"]}
+    )
+    df = df.with_columns(
+        [
+            pl.col("c").cast(pl.Categorical).alias("e"),
+            pl.col("d").cast(pl.Categorical).alias("f"),
+        ]
+    )
+    pd_out = df.to_pandas()
+    pd_out_dtypes_expected = [
+        np.int64,
+        np.float64,
+        np.object_,
+        np.object_,
+        pd.CategoricalDtype(categories=["a", "b", "c"], ordered=False),
+        pd.CategoricalDtype(categories=["e", "f"], ordered=False),
+    ]
+    assert pd_out.dtypes.to_list() == pd_out_dtypes_expected
+
+    try:
+        pd_pa_out = df.to_pandas(use_pyarrow_extension_array=True)
+        pd_pa_dtypes_names = [dtype.name for dtype in pd_pa_out.dtypes]
+        pd_pa_dtypes_names_expected = [
+            "int64[pyarrow]",
+            "int64[pyarrow]",
+            "large_string[pyarrow]",
+            "large_string[pyarrow]",
+            "dictionary<values=large_string, indices=int64, ordered=0>[pyarrow]",
+            "dictionary<values=large_string, indices=int64, ordered=0>[pyarrow]",
+        ]
+        assert pd_pa_dtypes_names == pd_pa_dtypes_names_expected
+    except ModuleNotFoundError:
+        # Skip test if Pandas 1.5.x is not installed.
+        pass


def test_numpy_to_lit() -> None:
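
The try/except blocks above make the new assertions a silent no-op when an
older pandas is installed. A variant that would surface this in the test
report (a sketch of an alternative, not part of this commit) could skip
explicitly:

    import pandas as pd
    import pytest

    if tuple(int(x) for x in pd.__version__.split(".")[:2]) < (1, 5):
        pytest.skip("use_pyarrow_extension_array=True requires pandas >= 1.5.0")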
18 changes: 14 additions & 4 deletions py-polars/tests/unit/test_series.py
@@ -470,11 +470,21 @@ def test_to_pandas() -> None:
    assert b.isnull().sum() == 1

    if a.dtype == pl.List:
-        vals = [(None if x is None else x.tolist()) for x in b]
+        vals_b = [(None if x is None else x.tolist()) for x in b]
    else:
-        vals = b.replace({np.nan: None}).values.tolist()  # type: ignore[union-attr]
+        vals_b = b.replace({np.nan: None}).values.tolist()  # type: ignore[union-attr]

-    assert vals == test_data
+    assert vals_b == test_data

+    try:
+        c = a.to_pandas(use_pyarrow_extension_array=True)
+        assert a.name == c.name
+        assert c.isnull().sum() == 1
+        vals_c = [None if x is pd.NA else x for x in c.tolist()]
+        assert vals_c == test_data
+    except ModuleNotFoundError:
+        # Skip test if Pandas 1.5.x is not installed.
+        pass


def test_to_python() -> None:
