Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fix DataFrame.drop(columns=cudf.Series/Index, axis=1) #16712

Merged
merged 5 commits into from
Sep 25, 2024
Merged
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 9 additions & 5 deletions python/cudf/cudf/core/frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -764,11 +764,15 @@ def fillna(
)

@_performance_tracking
def _drop_column(self, name):
"""Drop a column by *name*"""
if name not in self._data:
raise KeyError(f"column '{name}' does not exist")
del self._data[name]
# Delete the column stored under ``name`` from ``self._data`` in place.
# ``errors`` controls the missing-column case: with "ignore" the KeyError
# raised by the deletion is swallowed and the call is a no-op; with any
# other value (the default is "raise") a KeyError naming the column is raised.
def _drop_column(
self, name: abc.Hashable, errors: Literal["ignore", "raise"] = "raise"
) -> None:
"""Drop a column by *name* inplace."""
try:
del self._data[name]
except KeyError as err:
# Only the literal "ignore" suppresses the error; the re-raised KeyError
# carries a column-specific message and is chained to the original one.
if errors != "ignore":
raise KeyError(f"column '{name}' does not exist") from err

@_performance_tracking
def _quantile_table(
Expand Down
32 changes: 10 additions & 22 deletions python/cudf/cudf/core/indexed_frame.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,6 @@

from __future__ import annotations

import numbers
import operator
import textwrap
import warnings
Expand Down Expand Up @@ -150,24 +149,14 @@
)


def _get_host_unique(array):
def _get_unique_drop_labels(array):
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What's the benefit of this being a generator? You could just return an iterable rather than yield from it if that makes sense.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The impact is probably negligible in the context of .drop, but the generator was meant to avoid a round trip when array is a scalar: scalar -> iterable (in _get_unique_drop_labels) -> scalar (in frame._drop_column(scalar)). I can change _get_unique_drop_labels back to returning an iterable if preferred.

Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I’ll leave the choice to you. Just noting that yield from patterns tend to be dangerous for performance in cudf because host-device data copying is often involved.

"""Return labels to be dropped for IndexFrame.drop."""
if isinstance(array, (cudf.Series, cudf.Index, ColumnBase)):
return array.unique.to_pandas()
elif isinstance(array, (str, numbers.Number)):
return [array]
yield from as_column(array).unique().values_host
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Performance question: Do we want to run the unique() on GPU? These are columns and not rows, right? Kernel launch latency may exceed the time to run that unique step on CPU, if we expect this to be small.

I'm okay with running it on GPU if there's any uncertainty or if there are case-by-case decisions/tradeoffs we would need to consider; I just want to be sure we're not making a uniformly bad performance decision.

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

These are columns and not rows, right?

Yeah these are just column labels to drop that happen to be GPU backed.

I agree it might be worth doing this on the CPU instead. I'd assume len(columns to drop) << len(columns), and we convert to host anyway to iterate over the labels to drop, so we might as well do the unique step there too.

elif is_scalar(array):
yield array
else:
return set(array)


def _drop_columns(f: Frame, columns: abc.Iterable, errors: str):
for c in columns:
try:
f._drop_column(c)
except KeyError as e:
if errors == "ignore":
pass
else:
raise e
yield from set(array)


def _indices_from_labels(obj, labels):
Expand Down Expand Up @@ -5261,15 +5250,14 @@ def drop(
out = self.copy()

if axis in (1, "columns"):
target = _get_host_unique(target)

_drop_columns(out, target, errors)
for label in _get_unique_drop_labels(target):
out._drop_column(label, errors=errors)
elif axis in (0, "index"):
dropped = _drop_rows_by_labels(out, target, level, errors)

if columns is not None:
columns = _get_host_unique(columns)
_drop_columns(dropped, columns, errors)
for label in _get_unique_drop_labels(columns):
dropped._drop_column(label, errors=errors)

out._mimic_inplace(dropped, inplace=True)

Expand Down
11 changes: 11 additions & 0 deletions python/cudf/cudf/tests/test_dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,17 @@ def test_dataframe_drop_columns(pdf, columns, inplace):
assert_eq(expected, actual)


# Regression test: DataFrame.drop must accept a cudf Index or Series as the
# ``columns`` argument and produce the same result as pandas does when given
# the equivalent pandas object as ``labels``.
@pytest.mark.parametrize("obj", ["Index", "Series"])
def test_drop_cudf_obj_columns(obj):
pdf = pd.DataFrame({"A": [1], "B": [1]})
gdf = cudf.from_pandas(pdf)

columns = ["B"]
# ``obj`` names the container class; it is looked up on pandas for the
# expected result and on cudf for the actual result.
expected = pdf.drop(labels=getattr(pd, obj)(columns), axis=1)
actual = gdf.drop(columns=getattr(cudf, obj)(columns), axis=1)
assert_eq(expected, actual)


@pytest.mark.parametrize(
"pdf",
[
Expand Down
Loading