Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

feat: make pandas and NumPy optional dependencies, don't require PyArrow for plotting with Polars/Modin/cuDF #3452

Merged
merged 24 commits into from
Jul 15, 2024
Merged
Show file tree
Hide file tree
Changes from 1 commit
Commits
Show all changes
24 commits
Select commit Hold shift + click to select a range
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Prev Previous commit
Next Next commit
wip
  • Loading branch information
MarcoGorelli committed Jul 14, 2024
commit 110f848abcd5bee351d0da9b18275ff6d98cfb7e
33 changes: 18 additions & 15 deletions altair/utils/core.py
Original file line number Diff line number Diff line change
Expand Up @@ -468,27 +468,29 @@ def sanitize_narwhals_dataframe(
return data.select(columns)


def narwhalify(data: DataType) -> nw.DataFrame[Any]:
def to_eager_narwhals_dataframe(data: DataType) -> nw.DataFrame[Any]:
"""Wrap `data` in `narwhals.DataFrame`.

If `data` is not supported by Narwhals, but it is convertible
to a PyArrow table, then first convert to a PyArrow Table,
and then wrap in `narwhals.DataFrame`.
"""
if isinstance(data, nw.DataFrame):
# Early return if already a Narwhals DataFrame
return data
# Using `strict=False` will return `data` as-is if the object cannot be converted.
data = nw.from_native(data, eager_only=True, strict=False)
if isinstance(data, nw.DataFrame):
return data
if isinstance(data, DataFrameLike):
data_nw = nw.from_native(data)
if nw.get_level(data_nw) == 'metadata':
# If Narwhals' support for `data`'s class is only metadata-level, then we
# use the interchange protocol to convert to a PyArrow Table.
from altair.utils.data import arrow_table_from_dfi_dataframe

pa_table = arrow_table_from_dfi_dataframe(data)
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Out of curiosity, can polars import from the DataFrame interchange protocol without going through pyarrow? If so, it might be nice to do this when polars is installed but not pyarrow (e.g. if someone somehow manages to have ibis and polars installed but not pyarrow).

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It is possible, although pyarrow is a required dependency of ibis anyway

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Not a priority now (and I can't think of any specific scenarios that would benefit); I was just curious.

For posterity, the ibis team is in the process of removing the hard pyarrow dependency for some use cases (ibis-project/ibis#9552), but it will still be required for many (all?) specific backends I think.

Copy link
Contributor Author

@MarcoGorelli MarcoGorelli Jul 15, 2024

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

niiice! looking forward to being able to use ibis as a duckdb frontend without pandas+pyarrow being required 🤞 well done to ibis devs

return nw.from_native(pa_table, eager_only=True)
msg = f"Unsupported data type: {type(data)}"
raise TypeError(msg)
data_nw = nw.from_native(pa_table, eager_only=True)
elif isinstance(data_nw, nw.LazyFrame):
msg = (
"Lazy objects which do not implement the dataframe interchange protocol "
"are not supported. Please collect your lazy object into an eager one "
"first."
)
raise NotImplementedError(msg)

return data_nw


def parse_shorthand(
Expand Down Expand Up @@ -636,8 +638,9 @@ def parse_shorthand(
# if data is specified and type is not, infer type from data
if "type" not in attrs and is_data_type(data):
unescaped_field = attrs["field"].replace("\\", "")
data_nw = narwhalify(data)
if unescaped_field in data_nw.columns:
data_nw = nw.from_native(data)
schema = data_nw.schema
if unescaped_field in schema:
column = data_nw[unescaped_field]
if column.dtype in {nw.Object, nw.Unknown} and _is_pandas_dataframe(
nw.to_native(data_nw)
Expand Down
20 changes: 6 additions & 14 deletions altair/utils/data.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,6 @@
sanitize_pandas_dataframe,
DataFrameLike,
sanitize_narwhals_dataframe,
narwhalify,
)
from .core import sanitize_geo_interface
from .plugin_registry import PluginRegistry
Expand Down Expand Up @@ -150,8 +149,8 @@ def raise_max_rows_error():
else:
return data
else:
data = narwhalify(data)
values = data
from altair.utils.core import to_eager_narwhals_dataframe
values = to_eager_narwhals_dataframe(data)

if max_rows is not None and len(values) > max_rows:
raise_max_rows_error()
Expand Down Expand Up @@ -196,10 +195,6 @@ def sample(
# Maybe this should raise an error or return something useful?
return None
data = narwhalify(data)
if not isinstance(data, nw.DataFrame):
# Maybe this should raise an error or return something useful? Currently,
# if data is of type SupportsGeoInterface it lands here
return None
if not n:
if frac is None:
msg = "frac cannot be None if n is None with this data input type"
Expand Down Expand Up @@ -333,13 +328,10 @@ def to_values(data: DataType) -> ToValuesReturnType:
msg = "values expected in data dict, but not present."
raise KeyError(msg)
return data_native
elif isinstance(data, nw.DataFrame):
data = sanitize_narwhals_dataframe(data)
return {"values": data.rows(named=True)}
else:
# Should never reach this state as tested by check_data_type
msg = f"Unrecognized data type: {type(data)}"
raise ValueError(msg)
from altair.utils.core import to_eager_narwhals_dataframe
data = to_eager_narwhals_dataframe(data)
data = sanitize_narwhals_dataframe(data)
return {"values": data.rows(named=True)}


def check_data_type(data: DataType) -> None:
Expand Down
15 changes: 4 additions & 11 deletions altair/vegalite/v5/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,6 @@
from .schema import core, channels, mixins, Undefined, SCHEMA_URL

from altair.utils import Optional
from altair.utils.data import narwhalify as _narwhalify
from .data import data_transformers
from ... import utils
from ...expr import core as _expr_core
Expand Down Expand Up @@ -1016,17 +1015,11 @@ def to_dict(
# TopLevelMixin instance does not necessarily have copy defined but due to how
# Altair is set up this should hold. Too complex to type hint right now
copy = self.copy(deep=False) # type: ignore[attr-defined]
original_data = getattr(copy, "data", Undefined)
copy.data = _prepare_data(original_data, context)

data = getattr(copy, "data", Undefined)
try:
data = _narwhalify(data) # type: ignore[arg-type]
except TypeError:
# Non-narwhalifiable type still supported by Altair, such as dict.
pass
copy.data = _prepare_data(data, context)

if data is not Undefined:
context["data"] = data
if original_data is not Undefined:
context["data"] = original_data

# remaining to_dict calls are not at top level
context["top_level"] = False
Expand Down