Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Adds list.unique API #7664

Merged
merged 14 commits into from
Mar 31, 2021
Merged
15 changes: 15 additions & 0 deletions python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd
Original file line number Diff line number Diff line change
@@ -0,0 +1,15 @@
# Copyright (c) 2021, NVIDIA CORPORATION.

from libcpp.memory cimport unique_ptr

from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view
from cudf._lib.cpp.column.column cimport column
from cudf._lib.cpp.types cimport null_equality, nan_equality

# Cython declaration of the C++ function ``cudf::lists::drop_list_duplicates``
# from "cudf/lists/drop_list_duplicates.hpp". The whole extern block is
# declared ``nogil``, so the function may be called without holding the GIL.
cdef extern from "cudf/lists/drop_list_duplicates.hpp" \
namespace "cudf::lists" nogil:
# Takes a lists column view plus the null/NaN equality policies and returns
# the result as a ``unique_ptr[column]``.
# ``except +`` makes Cython translate a thrown C++ exception into a Python
# exception instead of letting it propagate through the C boundary.
cdef unique_ptr[column] drop_list_duplicates(
const lists_column_view lists_column,
null_equality nulls_equal,
nan_equality nans_equal
) except +
4 changes: 4 additions & 0 deletions python/cudf/cudf/_lib/cpp/types.pxd
Original file line number Diff line number Diff line change
Expand Up @@ -46,6 +46,10 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil:
EQUAL "cudf::null_equality::EQUAL"
UNEQUAL "cudf::null_equality::UNEQUAL"

# Cython-side declaration of the C++ enum ``cudf::nan_equality``; each member
# is bound to its fully qualified C++ name through the quoted cname.
# NOTE(review): presumably ALL_EQUAL treats NaN values as equal to each other
# and UNEQUAL treats them as distinct -- confirm against the libcudf docs.
ctypedef enum nan_equality "cudf::nan_equality":
ALL_EQUAL "cudf::nan_equality::ALL_EQUAL"
UNEQUAL "cudf::nan_equality::UNEQUAL"

ctypedef enum type_id "cudf::type_id":
EMPTY "cudf::type_id::EMPTY"
INT8 "cudf::type_id::INT8"
Expand Down
40 changes: 38 additions & 2 deletions python/cudf/cudf/_lib/lists.pyx
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,9 @@ from cudf._lib.cpp.lists.count_elements cimport (
from cudf._lib.cpp.lists.explode cimport (
explode_outer as cpp_explode_outer
)
from cudf._lib.cpp.lists.drop_list_duplicates cimport (
drop_list_duplicates as cpp_drop_list_duplicates
)
from cudf._lib.cpp.lists.sorting cimport (
sort_lists as cpp_sort_lists
)
Expand All @@ -22,7 +25,13 @@ from cudf._lib.cpp.scalar.scalar cimport scalar

from cudf._lib.cpp.table.table cimport table
from cudf._lib.cpp.table.table_view cimport table_view
from cudf._lib.cpp.types cimport size_type, order, null_order
from cudf._lib.cpp.types cimport (
size_type,
null_equality,
order,
null_order,
nan_equality
)

from cudf._lib.column cimport Column
from cudf._lib.table cimport Table
Expand Down Expand Up @@ -71,6 +80,34 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False):
)


def drop_list_duplicates(Column col, bool nulls_equal, bool nans_all_equal):
    """
    Call libcudf's ``drop_list_duplicates`` on the lists column ``col`` and
    wrap the resulting column in a ``Column``.

    nulls_equal: when True, libcudf treats any two nulls as equal; when
    False, it treats them as unequal.
    nans_all_equal: when True, libcudf treats any two elements from
    {+nan, -nan} as equal; when False, it treats them as unequal.
    """
    cdef unique_ptr[column] c_result
    cdef null_equality c_nulls_equal
    cdef nan_equality c_nans_equal
    cdef shared_ptr[lists_column_view] list_view = (
        make_shared[lists_column_view](col.view())
    )

    # Translate the Python booleans into the libcudf policy enums up front,
    # while the GIL is still held.
    if nulls_equal:
        c_nulls_equal = null_equality.EQUAL
    else:
        c_nulls_equal = null_equality.UNEQUAL

    if nans_all_equal:
        c_nans_equal = nan_equality.ALL_EQUAL
    else:
        c_nans_equal = nan_equality.UNEQUAL

    # Only C-level values are touched from here on, so the GIL can be
    # released for the duration of the libcudf call.
    with nogil:
        c_result = move(
            cpp_drop_list_duplicates(
                list_view.get()[0], c_nulls_equal, c_nans_equal
            )
        )

    return Column.from_unique_ptr(move(c_result))


def sort_lists(Column col, bool ascending, str na_position):
cdef shared_ptr[lists_column_view] list_view = (
make_shared[lists_column_view](col.view())
Expand Down Expand Up @@ -121,6 +158,5 @@ def contains_scalar(Column col, DeviceScalar search_key):
list_view.get()[0],
search_key_value[0],
))

result = Column.from_unique_ptr(move(c_result))
return result
36 changes: 36 additions & 0 deletions python/cudf/cudf/core/column/lists.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,7 @@
from cudf._lib.lists import (
contains_scalar,
count_elements,
drop_list_duplicates,
extract_element,
sort_lists,
)
Expand Down Expand Up @@ -361,6 +362,41 @@ def take(self, lists_indices):
else:
return res

def unique(self):
    """
    Return the unique elements of each list in the column.

    Within a list, repeated nulls are reduced to a single null and
    repeated NaNs are reduced to a single NaN. The order of the elements
    in each resulting list is not guaranteed.

    Returns
    -------
    ListColumn

    Raises
    ------
    NotImplementedError
        If the column holds nested lists.

    Examples
    --------
    >>> s = cudf.Series([[1, 1, 2, None, None], None, [4, 4], []])
    >>> s
    0    [1.0, 1.0, 2.0, nan, nan]
    1                         None
    2                   [4.0, 4.0]
    3                           []
    dtype: list
    >>> s.list.unique() # Order of list element is not guaranteed
    0    [1.0, 2.0, nan]
    1               None
    2              [4.0]
    3                 []
    dtype: list
    """

    # NOTE(review): children[1] is presumably the child column holding the
    # list elements; a list dtype there means the lists are nested.
    if is_list_dtype(self._column.children[1].dtype):
        raise NotImplementedError("Nested lists unique is not supported.")

    return self._return_or_inplace(
        drop_list_duplicates(
            self._column, nulls_equal=True, nans_all_equal=True
        )
    )

def sort_values(
self,
ascending=True,
Expand Down
34 changes: 34 additions & 0 deletions python/cudf/cudf/tests/test_list.py
Original file line number Diff line number Diff line change
@@ -1,6 +1,7 @@
# Copyright (c) 2020-2021, NVIDIA CORPORATION.
import functools

import numpy as np
import pandas as pd
import pyarrow as pa
import pytest
Expand Down Expand Up @@ -162,6 +163,39 @@ def test_take_invalid(invalid, exception):
gs.list.take(invalid)


@pytest.mark.parametrize(
    ("data", "expected"),
    [
        ([[1, 1, 2, 2], [], None, [3, 4, 5]], [[1, 2], [], None, [3, 4, 5]]),
        (
            [[1.233, np.nan, 1.234, 3.141, np.nan, 1.234]],
            [[1.233, 1.234, np.nan, 3.141]],
        ),  # duplicate nans
        ([[1, 1, 2, 2, None, None]], [[1, 2, None]]),  # duplicate nulls
        (
            [[1.233, np.nan, None, 1.234, 3.141, np.nan, 1.234, None]],
            [[1.233, 1.234, np.nan, None, 3.141]],
        ),  # duplicate nans and nulls
        ([[2, None, 1, None, 2]], [[1, 2, None]]),
        ([[], []], [[], []]),
        ([[], None], [[], None]),
    ],
)
def test_unique(data, expected):
    """
    Check ``Series.list.unique`` against hand-written expectations.

    Pandas' ``Series.unique`` de-duplicates nans and nulls separately, and
    the ``expected`` values are written to mimic that behavior.
    """
    unique_lists = cudf.Series(data, nan_as_null=False).list.unique()
    expected_series = cudf.Series(expected, nan_as_null=False)

    # The element order inside each unique list is not guaranteed, so both
    # sides are sorted before they are compared.
    assert_eq(
        expected_series.list.sort_values(),
        unique_lists.list.sort_values(),
    )


def key_func_builder(x, na_position):
if x is None:
if na_position == "first":
Expand Down