Skip to content

Commit

Permalink
Add struct.explode() method (#8729)
Browse files Browse the repository at this point in the history
Part of #8660. Note that the issue asks for this feature in dask-cudf, which this PR does not implement.

Depends on: #8306

Authors:
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - https://github.com/brandon-b-miller
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: #8729
  • Loading branch information
shwina authored Jul 20, 2021
1 parent 799f688 commit cdcc91c
Show file tree
Hide file tree
Showing 3 changed files with 70 additions and 2 deletions.
37 changes: 37 additions & 0 deletions python/cudf/cudf/core/column/struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -150,6 +150,8 @@ class StructMethods(ColumnMethods):
Struct methods for Series
"""

_column: StructColumn

def __init__(self, parent=None):
if not is_struct_dtype(parent.dtype):
raise AttributeError(
Expand Down Expand Up @@ -190,3 +192,38 @@ def field(self, key):
return self._return_or_inplace(self._column.children[pos])
else:
return self._return_or_inplace(self._column.children[key])

def explode(self):
    """
    Return a DataFrame whose columns are the fields of this struct Series.

    Each field name of the struct dtype is paired, in order, with the
    corresponding child column, and the pairs become the columns of the
    result.

    Returns
    -------
    DataFrame
        One column per struct field, keyed by the field name.

    Notes
    -----
    Note that a copy of the columns is made.

    Examples
    --------
    >>> s
    0    {'a': 1, 'b': 'x'}
    1    {'a': 2, 'b': 'y'}
    2    {'a': 3, 'b': 'z'}
    3    {'a': 4, 'b': 'a'}
    dtype: struct
    >>> s.struct.explode()
       a  b
    0  1  x
    1  2  y
    2  3  z
    3  4  a
    """
    struct_col = self._column

    # Deep-copy every child so the resulting DataFrame does not alias the
    # buffers owned by the struct column.
    exploded = {}
    for field_name, child in zip(struct_col.dtype.fields, struct_col.children):
        exploded[field_name] = child.copy(deep=True)

    accessor = cudf.core.column_accessor.ColumnAccessor(exploded)
    return cudf.DataFrame._from_data(accessor)
9 changes: 7 additions & 2 deletions python/cudf/cudf/core/dataframe.py
Original file line number Diff line number Diff line change
Expand Up @@ -7563,18 +7563,23 @@ def to_dict(self, orient="dict", into=dict):
def to_struct(self, name=None):
    """
    Return a struct Series composed of the columns of the DataFrame.

    Parameters
    ----------
    name: optional
        Name of the resulting Series

    Returns
    -------
    Series
        A struct Series built from this DataFrame's columns; it is
        constructed with this DataFrame's index.

    Notes
    -----
    Note that a copy of the columns is made.
    """
    col = cudf.core.column.build_struct_column(
        names=self._data.names, children=self._data.columns, size=len(self)
    )
    return cudf.Series._from_data(
        # Deep-copy the struct column so that later in-place modification
        # of the DataFrame's columns is not reflected in the returned
        # Series.
        cudf.core.column_accessor.ColumnAccessor(
            {name: col.copy(deep=True)}
        ),
        index=self.index,
        name=name,
    )
Expand Down
26 changes: 26 additions & 0 deletions python/cudf/cudf/tests/test_struct.py
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,28 @@ def test_struct_scalar_null():
assert slr.device_value.value is cudf.NA


def test_struct_explode():
    """Check ``Series.struct.explode`` on empty and populated struct Series."""
    # A struct Series with no rows and no fields explodes to an empty frame.
    empty_series = cudf.Series([], dtype=cudf.StructDtype({}))
    empty_frame = cudf.DataFrame({})
    assert_eq(empty_frame, empty_series.struct.explode())

    # Each struct field becomes a column of the exploded frame.
    records = [
        {"a": 1, "b": "x"},
        {"a": 2, "b": "y"},
        {"a": 3, "b": "z"},
        {"a": 4, "b": "a"},
    ]
    struct_series = cudf.Series(records)
    expected_frame = cudf.DataFrame(
        {"a": [1, 2, 3, 4], "b": ["x", "y", "z", "a"]}
    )
    exploded = struct_series.struct.explode()
    assert_eq(expected_frame, exploded)

    # check that a copy was made: writing to the exploded frame must not
    # change what a fresh explode() of the source Series returns.
    exploded["a"][0] = 5
    assert_eq(struct_series.struct.explode(), expected_frame)


def test_dataframe_to_struct():
df = cudf.DataFrame()
expect = cudf.Series(dtype=cudf.StructDtype({}))
Expand All @@ -179,6 +201,10 @@ def test_dataframe_to_struct():
got = df.to_struct()
assert_eq(expect, got)

# check that a copy was made:
df["a"][0] = 5
assert_eq(got, expect)


@pytest.mark.parametrize(
"series, start, end",
Expand Down

0 comments on commit cdcc91c

Please sign in to comment.