diff --git a/python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd b/python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd new file mode 100644 index 00000000000..40b1836f932 --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd @@ -0,0 +1,15 @@ +# Copyright (c) 2021, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr + +from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.types cimport null_equality, nan_equality + +cdef extern from "cudf/lists/drop_list_duplicates.hpp" \ + namespace "cudf::lists" nogil: + cdef unique_ptr[column] drop_list_duplicates( + const lists_column_view lists_column, + null_equality nulls_equal, + nan_equality nans_equal + ) except + diff --git a/python/cudf/cudf/_lib/cpp/types.pxd b/python/cudf/cudf/_lib/cpp/types.pxd index bd1108b2cdf..1f2094b3958 100644 --- a/python/cudf/cudf/_lib/cpp/types.pxd +++ b/python/cudf/cudf/_lib/cpp/types.pxd @@ -46,6 +46,10 @@ cdef extern from "cudf/types.hpp" namespace "cudf" nogil: EQUAL "cudf::null_equality::EQUAL" UNEQUAL "cudf::null_equality::UNEQUAL" + ctypedef enum nan_equality "cudf::nan_equality": + ALL_EQUAL "cudf::nan_equality::ALL_EQUAL" + UNEQUAL "cudf::nan_equality::UNEQUAL" + ctypedef enum type_id "cudf::type_id": EMPTY "cudf::type_id::EMPTY" INT8 "cudf::type_id::INT8" diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 7f745e58c67..e93cba20f65 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -10,6 +10,9 @@ from cudf._lib.cpp.lists.count_elements cimport ( from cudf._lib.cpp.lists.explode cimport ( explode_outer as cpp_explode_outer ) +from cudf._lib.cpp.lists.drop_list_duplicates cimport ( + drop_list_duplicates as cpp_drop_list_duplicates +) from cudf._lib.cpp.lists.sorting cimport ( sort_lists as cpp_sort_lists ) @@ -22,7 +25,13 @@ from cudf._lib.cpp.scalar.scalar cimport scalar from 
cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view -from cudf._lib.cpp.types cimport size_type, order, null_order +from cudf._lib.cpp.types cimport ( + size_type, + null_equality, + order, + null_order, + nan_equality +) from cudf._lib.column cimport Column from cudf._lib.table cimport Table @@ -71,6 +80,34 @@ def explode_outer(Table tbl, int explode_column_idx, bool ignore_index=False): ) +def drop_list_duplicates(Column col, bool nulls_equal, bool nans_all_equal): + """ + Drop duplicate elements within each list of ``col`` using libcudf. + nans_all_equal == True makes libcudf treat any two elements from + {+nan, -nan} as equal, and as unequal otherwise; nulls_equal == True + makes it treat any two nulls as equal, and as unequal otherwise. + """ + cdef shared_ptr[lists_column_view] list_view = ( + make_shared[lists_column_view](col.view()) + ) + cdef null_equality c_nulls_equal = ( + null_equality.EQUAL if nulls_equal else null_equality.UNEQUAL + ) + cdef nan_equality c_nans_equal = ( + nan_equality.ALL_EQUAL if nans_all_equal else nan_equality.UNEQUAL + ) + + cdef unique_ptr[column] c_result + + with nogil: + c_result = move( + cpp_drop_list_duplicates(list_view.get()[0], + c_nulls_equal, + c_nans_equal) + ) + return Column.from_unique_ptr(move(c_result)) + + def sort_lists(Column col, bool ascending, str na_position): cdef shared_ptr[lists_column_view] list_view = ( make_shared[lists_column_view](col.view()) @@ -121,6 +158,5 @@ def contains_scalar(Column col, DeviceScalar search_key): list_view.get()[0], search_key_value[0], )) - result = Column.from_unique_ptr(move(c_result)) return result diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index b7f34e8c007..364675cd035 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -10,6 +10,7 @@ from cudf._lib.lists import ( contains_scalar, count_elements, + drop_list_duplicates, extract_element, sort_lists, ) @@ -361,6 
+362,41 @@ def take(self, lists_indices): else: return res + def unique(self): + """ + Return the unique elements of each list in the column. Element order + within a result list is not guaranteed. Nested lists are not supported. + + Returns + ------- + ListColumn + + Examples + -------- + >>> s = cudf.Series([[1, 1, 2, None, None], None, [4, 4], []]) + >>> s + 0 [1.0, 1.0, 2.0, nan, nan] + 1 None + 2 [4.0, 4.0] + 3 [] + dtype: list + >>> s.list.unique() # Order of list elements is not guaranteed + 0 [1.0, 2.0, nan] + 1 None + 2 [4.0] + 3 [] + dtype: list + """ + + if is_list_dtype(self._column.children[1].dtype): + raise NotImplementedError("Nested lists unique is not supported.") + + return self._return_or_inplace( + drop_list_duplicates( + self._column, nulls_equal=True, nans_all_equal=True + ) + ) + def sort_values( self, ascending=True, diff --git a/python/cudf/cudf/tests/test_list.py b/python/cudf/cudf/tests/test_list.py index 5645ce60596..9906600304b 100644 --- a/python/cudf/cudf/tests/test_list.py +++ b/python/cudf/cudf/tests/test_list.py @@ -1,6 +1,7 @@ # Copyright (c) 2020-2021, NVIDIA CORPORATION. import functools +import numpy as np import pandas as pd import pyarrow as pa import pytest @@ -162,6 +163,39 @@ def test_take_invalid(invalid, exception): gs.list.take(invalid) +@pytest.mark.parametrize( + ("data", "expected"), + [ + ([[1, 1, 2, 2], [], None, [3, 4, 5]], [[1, 2], [], None, [3, 4, 5]]), + ( + [[1.233, np.nan, 1.234, 3.141, np.nan, 1.234]], + [[1.233, 1.234, np.nan, 3.141]], + ), # duplicate nans + ([[1, 1, 2, 2, None, None]], [[1, 2, None]]), # duplicate nulls + ( + [[1.233, np.nan, None, 1.234, 3.141, np.nan, 1.234, None]], + [[1.233, 1.234, np.nan, None, 3.141]], + ), # duplicate nans and nulls + ([[2, None, 1, None, 2]], [[1, 2, None]]), + ([[], []], [[], []]), + ([[], None], [[], None]), + ], +) +def test_unique(data, expected): + """ + Pandas de-duplicates nans and nulls respectively in Series.unique. 
+ `expected` is set up to mimic that; both sides are sorted before comparing. + """ + gs = cudf.Series(data, nan_as_null=False) + + got = gs.list.unique() + expected = cudf.Series(expected, nan_as_null=False).list.sort_values() + + got = got.list.sort_values() + + assert_eq(expected, got) + + def key_func_builder(x, na_position): if x is None: if na_position == "first":