Skip to content

Commit

Permalink
Python API for ListMethods.len() (#7283)
Browse files Browse the repository at this point in the history
Closes #7157 

This PR adds the `ListMethods.len()` API, which returns an integer column containing the length of each element in a `ListColumn`.
Example:
```python
>>> s = cudf.Series([[1,2], None, [3]])
>>> s
0    [1, 2]
1      None
2       [3]
dtype: list
>>> s.list.len()
0       2
1    <NA>
2       1
dtype: int32
```

Authors:
  - Michael Wang (@isVoid)
  - Ashwin Srinath (@shwina)

Approvers:
  - Keith Kraus (@kkraus14)
  - @brandon-b-miller

URL: #7283
  • Loading branch information
isVoid authored Feb 4, 2021
1 parent 678235e commit 7fd069e
Show file tree
Hide file tree
Showing 5 changed files with 91 additions and 1 deletion.
9 changes: 9 additions & 0 deletions python/cudf/cudf/_lib/cpp/lists/count_elements.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,9 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr

from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view

# Cython declaration of the C++ function ``cudf::lists::count_elements`` from
# the header "cudf/lists/count_elements.hpp".
#
# It takes a ``lists_column_view`` and returns ownership of a newly created
# ``column`` through a ``unique_ptr``.
# ``nogil`` on the extern block marks the function as callable without holding
# the Python GIL; ``except +`` asks Cython to translate any C++ exception
# thrown by the call into a Python exception.
cdef extern from "cudf/lists/count_elements.hpp" namespace "cudf::lists" nogil:
cdef unique_ptr[column] count_elements(const lists_column_view) except +
1 change: 0 additions & 1 deletion python/cudf/cudf/_lib/cpp/lists/lists_column_view.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@ from cudf._lib.cpp.column.column_view cimport (

cdef extern from "cudf/lists/lists_column_view.hpp" namespace "cudf" nogil:
cdef cppclass lists_column_view(column_view):
lists_column_view() except +
lists_column_view(const column_view& lists_column) except +
column_view parent() except +
column_view offsets() except +
Expand Down
34 changes: 34 additions & 0 deletions python/cudf/cudf/_lib/lists.pyx
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr, shared_ptr, make_shared
from libcpp.utility cimport move

from cudf._lib.cpp.lists.count_elements cimport (
count_elements as cpp_count_elements
)
from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf._lib.cpp.column.column_view cimport column_view
from cudf._lib.cpp.column.column cimport column

from cudf._lib.column cimport Column


from cudf.core.dtypes import ListDtype


def count_elements(Column col):
    """
    Count the elements of every list in a list column.

    Parameters
    ----------
    col : Column
        Column whose ``dtype`` is a ``ListDtype``.

    Returns
    -------
    Column
        The column produced by libcudf's ``cudf::lists::count_elements``
        for ``col``.

    Raises
    ------
    TypeError
        If ``col.dtype`` is not a ``ListDtype``.
    """
    if not isinstance(col.dtype, ListDtype):
        raise TypeError("col is not a list column.")

    # lists_column_view has no default constructor, so it cannot be declared
    # as a plain C++ stack variable here; hold it through a shared_ptr built
    # directly from the column view instead.
    cdef shared_ptr[lists_column_view] lists_view = (
        make_shared[lists_column_view](col.view())
    )
    cdef unique_ptr[column] c_counts

    # The libcudf call is declared nogil, so release the GIL while it runs.
    with nogil:
        c_counts = move(cpp_count_elements(lists_view.get()[0]))

    # Hand ownership of the C++ result over to a Python-level Column.
    return Column.from_unique_ptr(move(c_counts))
25 changes: 25 additions & 0 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import pyarrow as pa

import cudf
from cudf._lib.lists import count_elements
from cudf.core.buffer import Buffer
from cudf.core.column import ColumnBase, column
from cudf.core.column.methods import ColumnMethodsMixin
Expand Down Expand Up @@ -203,3 +204,27 @@ def leaves(self):
return self._return_or_inplace(
self._column.elements, retain_index=False
)

def len(self):
    """
    Return the number of items in each list of the Series/Index.

    As the example below shows, a null row produces ``<NA>`` rather
    than a count.

    Returns
    -------
    Series or Index

    Examples
    --------
    >>> s = cudf.Series([[1, 2, 3], None, [4, 5]])
    >>> s
    0    [1, 2, 3]
    1         None
    2       [4, 5]
    dtype: list
    >>> s.list.len()
    0       3
    1    <NA>
    2       2
    dtype: int32
    """
    # Compute the per-row counts on the underlying column, then wrap the
    # result the same way the other list accessor methods do.
    lengths = count_elements(self._column)
    return self._return_or_inplace(lengths)
23 changes: 23 additions & 0 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
Expand Up @@ -89,3 +89,26 @@ def test_listdtype_hash():
c = cudf.core.dtypes.ListDtype("int32")

assert hash(a) != hash(c)


@pytest.mark.parametrize(
    "data",
    [
        [[]],
        [[1, 2, 3], [4, 5]],
        [[1, 2, 3], [], [4, 5]],
        [[1, 2, 3], None, [4, 5]],
        [[None, None], [None]],
        [[[[[[1, 2, 3]]]]]],
        cudf.Series([[1, 2]]).iloc[0:0],
        cudf.Series([None, [1, 2]]).iloc[0:1],
    ],
)
def test_len(data):
    """Check ``Series.list.len()`` against a per-row ``len`` done in pandas."""

    def _row_length(row):
        # Null rows have no length; keep them null in the expectation.
        return len(row) if row is not None else None

    list_series = cudf.Series(data)
    host_series = list_series.to_pandas()

    expected = host_series.map(_row_length)
    actual = list_series.list.len()

    # The pandas-side result dtype differs from cudf's, so only the values
    # are compared.
    assert_eq(expected, actual, check_dtype=False)

0 comments on commit 7fd069e

Please sign in to comment.