From 1d7e8e09744e457c7ec7a94999cb460bd71647ec Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 24 Jun 2022 14:51:53 -0700 Subject: [PATCH 01/32] Add new implementation and test files --- cpp/CMakeLists.txt | 4 +- .../cudf/lists/detail/stream_compaction.hpp | 47 ++++++ cpp/include/cudf/lists/stream_compaction.hpp | 25 +++ .../apply_boolean_mask.cu | 0 cpp/src/lists/stream_compaction/distinct.cu | 143 ++++++++++++++++++ cpp/src/lists/utilities.cu | 55 +++++++ cpp/src/lists/utilities.hpp | 53 +++++++ cpp/tests/CMakeLists.txt | 3 +- .../apply_boolean_mask_test.cpp | 0 .../lists/stream_compaction/distinct.cpp | 84 ++++++++++ 10 files changed, 412 insertions(+), 2 deletions(-) rename cpp/src/lists/{ => stream_compaction}/apply_boolean_mask.cu (100%) create mode 100644 cpp/src/lists/stream_compaction/distinct.cu create mode 100644 cpp/src/lists/utilities.cu create mode 100644 cpp/src/lists/utilities.hpp rename cpp/tests/lists/{ => stream_compaction}/apply_boolean_mask_test.cpp (100%) create mode 100644 cpp/tests/lists/stream_compaction/distinct.cpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 0cb12582d7a..8e9d5ec9809 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -369,7 +369,6 @@ add_library( src/join/mixed_join_size_kernel_nulls.cu src/join/mixed_join_size_kernels_semi.cu src/join/semi_join.cu - src/lists/apply_boolean_mask.cu src/lists/contains.cu src/lists/combine/concatenate_list_elements.cu src/lists/combine/concatenate_rows.cu @@ -387,6 +386,9 @@ add_library( src/lists/lists_column_view.cu src/lists/segmented_sort.cu src/lists/sequences.cu + src/lists/stream_compaction/apply_boolean_mask.cu + src/lists/stream_compaction/distinct.cu + src/lists/utilities.cu src/merge/merge.cu src/partitioning/partitioning.cu src/partitioning/round_robin.cu diff --git a/cpp/include/cudf/lists/detail/stream_compaction.hpp b/cpp/include/cudf/lists/detail/stream_compaction.hpp index 0e9f2ec16c4..6b7202f821f 100644 --- 
a/cpp/include/cudf/lists/detail/stream_compaction.hpp +++ b/cpp/include/cudf/lists/detail/stream_compaction.hpp @@ -34,4 +34,51 @@ std::unique_ptr apply_boolean_mask( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @copydoc cudf::lists::distinct + * + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +std::unique_ptr distinct( + lists_column_view const& input, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Remove duplicate list elements from a lists column. + * + * The input lists column is not given to this function directly. Instead, its child column and a + * label array containing the corresponding list labels for each element are used to access the + * input lists. The output null mask and null count are also provided as the input into this + * function. + * + * This function performs exactly the same as the API + * `cudf::lists::distinct(lists_column_view const& input)` but requires a different set of + * parameters. This is because it is called internally in various APIs where the label array and the + * output null_mask and null_count already exist. 
+ * + * @param n_lists Number of lists in the input and output lists columns + * @param child_labels Array containing labels of the list elements + * @param child The child column of the input lists column + * @param null_mask The null_mask used for constructing the output column + * @param null_count The null_count used for constructing the output column + * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned object + * @return A pair of output columns `{out_offsets, out_child}` + */ +std::unique_ptr distinct( + size_type n_lists, + column_view const& child_labels, + column_view const& child, + rmm::device_buffer&& null_mask, + size_type null_count, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace cudf::lists::detail diff --git a/cpp/include/cudf/lists/stream_compaction.hpp b/cpp/include/cudf/lists/stream_compaction.hpp index c7a9731eb65..69f94a2c4bf 100644 --- a/cpp/include/cudf/lists/stream_compaction.hpp +++ b/cpp/include/cudf/lists/stream_compaction.hpp @@ -55,4 +55,29 @@ std::unique_ptr apply_boolean_mask( lists_column_view const& boolean_mask, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create a new lists column without duplicate elements in each list. + * + * + * A null row in the input lists column will result in a null output row. The order of + * elements within each list is not preserved. 
+ * + * Example: + * @code{.pseudo} + * input = { {0, 1, 2, 3, 2}, {1, 2, 3}, null, {4, null, null, 5} } + * result = { {0, 1, 2, 3}, {1, 2, 3}, null, {4, null, 5} } + * @endcode + * + * @param input The input lists column + * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal + * @param mr Device memory resource used to allocate the returned object + * @return A lists column containing the distinct elements of each input list + */ +std::unique_ptr distinct( + lists_column_view const& input, + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + } // namespace cudf::lists diff --git a/cpp/src/lists/apply_boolean_mask.cu b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu similarity index 100% rename from cpp/src/lists/apply_boolean_mask.cu rename to cpp/src/lists/stream_compaction/apply_boolean_mask.cu diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu new file mode 100644 index 00000000000..1b2c6b6e66f --- /dev/null +++ b/cpp/src/lists/stream_compaction/distinct.cu @@ -0,0 +1,143 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +namespace cudf::lists { +namespace detail { + +std::unique_ptr distinct(size_type n_lists, + column_view const& child_labels, + column_view const& child, + rmm::device_buffer&& null_mask, + size_type null_count, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Algorithm: + // - Get indices of distinct rows of the table {labels, child}. + // - Scatter these indices into a marker array that marks if a row will be copied to the output. + // - Collect output rows (with order preserved) using the marker array and build the output + // offsets column. + + auto const input_table = table_view{{child_labels, child}}; + + auto const distinct_indices = cudf::detail::get_distinct_indices( + input_table, duplicate_keep_option::KEEP_ANY, nulls_equal, nans_equal, stream); + + auto const index_markers = [&] { + auto markers = rmm::device_uvector(child.size(), stream); + thrust::uninitialized_fill(rmm::exec_policy(stream), markers.begin(), markers.end(), false); + thrust::scatter( + rmm::exec_policy(stream), + thrust::constant_iterator(true, 0), + thrust::constant_iterator(true, static_cast(distinct_indices.size())), + distinct_indices.begin(), + markers.begin()); + return markers; + }(); + + auto const out_table = cudf::detail::copy_if( + input_table, + [index_markers = index_markers.begin()] __device__(auto const idx) { + return index_markers[idx]; + }, + stream, + mr); + auto out_offsets = reconstruct_offsets(out_table->get_column(0).view(), n_lists, stream, mr); + + return make_lists_column(n_lists, + std::move(out_offsets), + std::move(out_table->release().back()), + null_count, + 
std::move(null_mask), + stream, + mr); +} + +std::unique_ptr distinct(lists_column_view const& input, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Algorithm: + // - Generate labels for the child elements. + // - Get indices of distinct rows of the table {labels, child}. + // - Scatter these indices into a marker array that marks if a row will be copied to the output. + // - Collect output rows (with order preserved) using the marker array and build the output + // lists column. + + auto const child = input.get_sliced_child(stream); + auto const labels = generate_labels(input, child.size(), stream); + + return distinct(input.size(), + labels->view(), + child, + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + nulls_equal, + nans_equal, + stream, + mr); +} + +} // namespace detail + +std::unique_ptr distinct(lists_column_view const& input, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::distinct(input, nulls_equal, nans_equal, rmm::cuda_stream_default, mr); +} + +} // namespace cudf::lists diff --git a/cpp/src/lists/utilities.cu b/cpp/src/lists/utilities.cu new file mode 100644 index 00000000000..fb38e3711e5 --- /dev/null +++ b/cpp/src/lists/utilities.cu @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "utilities.hpp" + +#include +#include + +namespace cudf::lists::detail { + +std::unique_ptr generate_labels(lists_column_view const& input, + size_type n_elements, + rmm::cuda_stream_view stream) +{ + auto labels = make_numeric_column( + data_type(type_to_id()), n_elements, cudf::mask_state::UNALLOCATED, stream); + auto const labels_begin = labels->mutable_view().template begin(); + cudf::detail::label_segments( + input.offsets_begin(), input.offsets_end(), labels_begin, labels_begin + n_elements, stream); + return labels; +} + +std::unique_ptr reconstruct_offsets(column_view const& labels, + size_type n_lists, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + +{ + auto out_offsets = make_numeric_column( + data_type{type_to_id()}, n_lists + 1, mask_state::UNALLOCATED, stream, mr); + + auto const labels_begin = labels.template begin(); + auto const offsets_begin = out_offsets->mutable_view().template begin(); + cudf::detail::labels_to_offsets(labels_begin, + labels_begin + labels.size(), + offsets_begin, + offsets_begin + out_offsets->size(), + stream); + return out_offsets; +} + +} // namespace cudf::lists::detail diff --git a/cpp/src/lists/utilities.hpp b/cpp/src/lists/utilities.hpp new file mode 100644 index 00000000000..1ca48665a29 --- /dev/null +++ b/cpp/src/lists/utilities.hpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "utilities.hpp" + +#include +#include + +#include + +namespace cudf::lists::detail { + +/** + * @brief Generate list labels for elements in the child column of the input lists column. + * + * @param input The input lists column + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A column containing list labels corresponding to each input list elements + */ +std::unique_ptr generate_labels(lists_column_view const& input, + size_type n_elements, + rmm::cuda_stream_view stream); + +/** + * @brief Reconstruct an offsets column from the input labels array. + * + * @param labels The list labels corresponding to each input list elements + * @param n_lists The number of lists in the input lists column + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned object + * @return The output offsets column + */ +std::unique_ptr reconstruct_offsets(column_view const& labels, + size_type n_lists, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + +} // namespace cudf::lists::detail diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 816c5a1c59c..7ce3bd6e0e2 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -471,7 +471,6 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp) # * lists tests ---------------------------------------------------------------------------------- ConfigureTest( LISTS_TEST - lists/apply_boolean_mask_test.cpp lists/combine/concatenate_list_elements_tests.cpp lists/combine/concatenate_rows_tests.cpp lists/contains_tests.cpp @@ -481,6 +480,8 @@ ConfigureTest( lists/extract_tests.cpp lists/sequences_tests.cpp lists/sort_lists_tests.cpp + lists/stream_compaction/apply_boolean_mask_test.cpp + lists/stream_compaction/distinct.cpp ) 
# ################################################################################################## diff --git a/cpp/tests/lists/apply_boolean_mask_test.cpp b/cpp/tests/lists/stream_compaction/apply_boolean_mask_test.cpp similarity index 100% rename from cpp/tests/lists/apply_boolean_mask_test.cpp rename to cpp/tests/lists/stream_compaction/apply_boolean_mask_test.cpp diff --git a/cpp/tests/lists/stream_compaction/distinct.cpp b/cpp/tests/lists/stream_compaction/distinct.cpp new file mode 100644 index 00000000000..5072cc9eff5 --- /dev/null +++ b/cpp/tests/lists/stream_compaction/distinct.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include + +auto constexpr null{0}; // null at current level +// auto constexpr XXX{0}; // null pushed down from parent level +auto constexpr NaN = std::numeric_limits::quiet_NaN(); + +using bools_col = cudf::test::fixed_width_column_wrapper; +// using int32s_col = cudf::test::fixed_width_column_wrapper; +// using floats_col = cudf::test::fixed_width_column_wrapper; +using lists_col = cudf::test::lists_column_wrapper; +// using strings_col = cudf::test::strings_column_wrapper; +// using structs_col = cudf::test::structs_column_wrapper; +using lists_cv = cudf::lists_column_view; + +// using cudf::nan_policy; +// using cudf::null_equality; +// using cudf::null_policy; +// using cudf::test::iterators::no_nulls; +using cudf::test::iterators::null_at; +using cudf::test::iterators::nulls_at; + +namespace { +auto set_op_sorted(cudf::column_view const& lhs, cudf::column_view const& rhs) +{ + auto const results = cudf::lists::set_difference(lists_cv{lhs}, lists_cv{rhs}); + return cudf::lists::sort_lists( + lists_cv{*results}, cudf::order::ASCENDING, cudf::null_order::BEFORE); +} +} // namespace + +struct ListDistinctTest : public cudf::test::BaseFixture { +}; + +template +struct ListDistinctTypedTest : public cudf::test::BaseFixture { +}; + +using TestTypes = cudf::test:: + Concat; + +TYPED_TEST_SUITE(ListDistinctTypedTest, TestTypes); + +TEST_F(ListDistinctTest, TrivialTest) +{ + auto const lhs = lists_col{{lists_col{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 0.0}, null_at(6)}, + lists_col{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 1.0}, null_at(6)}, + {} /*NULL*/, + lists_col{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 1.0}, null_at(6)}}, + null_at(2)}; + auto const rhs = lists_col{{lists_col{{1.0, 0.5, null, 0.0, 0.0, null, NaN}, nulls_at({2, 5})}, + lists_col{{2.0, 1.0, null, 0.0, 0.0, null}, nulls_at({2, 5})}, + lists_col{{2.0, 1.0, null, 0.0, 0.0, null}, nulls_at({2, 5})}, + {} /*NULL*/}, + null_at(3)}; + + auto const 
results_sorted = set_op_sorted(lhs, rhs); + auto const expected = + lists_col{{lists_col{5.0}, lists_col{5.0, NaN}, lists_col{} /*NULL*/, lists_col{} /*NULL*/}, + nulls_at({2, 3})}; + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results_sorted); +} From 51b80db1a878ff25a01ec527dacd68c2f72726ba Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 24 Jun 2022 15:01:41 -0700 Subject: [PATCH 02/32] Fix compile error --- cpp/src/lists/stream_compaction/distinct.cu | 1 - .../lists/stream_compaction/distinct.cpp | 33 +++++++++---------- 2 files changed, 15 insertions(+), 19 deletions(-) diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu index 1b2c6b6e66f..fdb39674b17 100644 --- a/cpp/src/lists/stream_compaction/distinct.cu +++ b/cpp/src/lists/stream_compaction/distinct.cu @@ -26,7 +26,6 @@ #include #include #include -#include #include #include #include diff --git a/cpp/tests/lists/stream_compaction/distinct.cpp b/cpp/tests/lists/stream_compaction/distinct.cpp index 5072cc9eff5..290e295d7da 100644 --- a/cpp/tests/lists/stream_compaction/distinct.cpp +++ b/cpp/tests/lists/stream_compaction/distinct.cpp @@ -20,8 +20,8 @@ #include #include -#include #include +#include auto constexpr null{0}; // null at current level // auto constexpr XXX{0}; // null pushed down from parent level @@ -40,12 +40,12 @@ using lists_cv = cudf::lists_column_view; // using cudf::null_policy; // using cudf::test::iterators::no_nulls; using cudf::test::iterators::null_at; -using cudf::test::iterators::nulls_at; +// using cudf::test::iterators::nulls_at; namespace { -auto set_op_sorted(cudf::column_view const& lhs, cudf::column_view const& rhs) +auto distinct_sorted(cudf::column_view const& input) { - auto const results = cudf::lists::set_difference(lists_cv{lhs}, lists_cv{rhs}); + auto const results = cudf::lists::distinct(lists_cv{input}); return cudf::lists::sort_lists( lists_cv{*results}, cudf::order::ASCENDING, cudf::null_order::BEFORE); } @@ 
-65,20 +65,17 @@ TYPED_TEST_SUITE(ListDistinctTypedTest, TestTypes); TEST_F(ListDistinctTest, TrivialTest) { - auto const lhs = lists_col{{lists_col{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 0.0}, null_at(6)}, - lists_col{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 1.0}, null_at(6)}, - {} /*NULL*/, - lists_col{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 1.0}, null_at(6)}}, - null_at(2)}; - auto const rhs = lists_col{{lists_col{{1.0, 0.5, null, 0.0, 0.0, null, NaN}, nulls_at({2, 5})}, - lists_col{{2.0, 1.0, null, 0.0, 0.0, null}, nulls_at({2, 5})}, - lists_col{{2.0, 1.0, null, 0.0, 0.0, null}, nulls_at({2, 5})}, - {} /*NULL*/}, - null_at(3)}; + auto const input = lists_col{{lists_col{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 0.0}, null_at(6)}, + lists_col{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 1.0}, null_at(6)}, + {} /*NULL*/, + lists_col{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 1.0}, null_at(6)}}, + null_at(2)}; - auto const results_sorted = set_op_sorted(lhs, rhs); - auto const expected = - lists_col{{lists_col{5.0}, lists_col{5.0, NaN}, lists_col{} /*NULL*/, lists_col{} /*NULL*/}, - nulls_at({2, 3})}; + auto const results_sorted = distinct_sorted(input); + auto const expected = lists_col{{lists_col{{null, 0.0, 5.0, NaN}, null_at(0)}, + lists_col{{null, 0.0, 1.0, 5.0, NaN}, null_at(0)}, + lists_col{} /*NULL*/, + lists_col{{null, 0.0, 1.0, 5.0, NaN}, null_at(0)}}, + null_at(2)}; CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results_sorted); } From 08a76ad78c0cc4e37dfc43291b84aa9c0c40e5da Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 27 Jun 2022 05:59:02 -0700 Subject: [PATCH 03/32] Rename function --- .../cudf/lists/detail/stream_compaction.hpp | 18 +++++----- cpp/src/lists/stream_compaction/distinct.cu | 36 +++++++++---------- .../lists/stream_compaction/distinct.cpp | 2 +- 3 files changed, 28 insertions(+), 28 deletions(-) diff --git a/cpp/include/cudf/lists/detail/stream_compaction.hpp b/cpp/include/cudf/lists/detail/stream_compaction.hpp index 6b7202f821f..326277acca4 100644 --- 
a/cpp/include/cudf/lists/detail/stream_compaction.hpp +++ b/cpp/include/cudf/lists/detail/stream_compaction.hpp @@ -47,17 +47,17 @@ std::unique_ptr distinct( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Remove duplicate list elements from a lists column. + * @brief Remove duplicate list elements from a lists column given by its child column and an + * associated list labels column. * * The input lists column is not given to this function directly. Instead, its child column and a - * label array containing the corresponding list labels for each element are used to access the - * input lists. The output null mask and null count are also provided as the input into this - * function. + * label column containing the corresponding list labels for each element are given. The output null + * mask and null count are also provided as input parameters to this function. * - * This function performs exactly the same as the API + * This function generates exactly the same output as the API * `cudf::lists::distinct(lists_column_view const& input)` but requires a different set of - * parameters. This is because it is called internally in various APIs where the label array and the - * output null_mask and null_count already exist. + * input parameters. This is because it is going to be called internally in various other APIs where + * the label column and the output null_mask and null_count already exist. 
* * @param n_lists Number of lists in the input and output lists columns * @param child_labels Array containing labels of the list elements @@ -68,9 +68,9 @@ std::unique_ptr distinct( * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned object - * @return A pair of output columns `{out_offsets, out_child}` + * @return The final lists column without duplicate list elements */ -std::unique_ptr distinct( +std::unique_ptr distinct_by_labels( size_type n_lists, column_view const& child_labels, column_view const& child, diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu index fdb39674b17..4a4b507f266 100644 --- a/cpp/src/lists/stream_compaction/distinct.cu +++ b/cpp/src/lists/stream_compaction/distinct.cu @@ -50,15 +50,15 @@ namespace cudf::lists { namespace detail { -std::unique_ptr distinct(size_type n_lists, - column_view const& child_labels, - column_view const& child, - rmm::device_buffer&& null_mask, - size_type null_count, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +std::unique_ptr distinct_by_labels(size_type n_lists, + column_view const& child_labels, + column_view const& child, + rmm::device_buffer&& null_mask, + size_type null_count, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { // Algorithm: // - Get indices of distinct rows of the table {labels, child}. 
@@ -117,15 +117,15 @@ std::unique_ptr distinct(lists_column_view const& input, auto const child = input.get_sliced_child(stream); auto const labels = generate_labels(input, child.size(), stream); - return distinct(input.size(), - labels->view(), - child, - cudf::detail::copy_bitmask(input.parent(), stream, mr), - input.null_count(), - nulls_equal, - nans_equal, - stream, - mr); + return distinct_by_labels(input.size(), + labels->view(), + child, + cudf::detail::copy_bitmask(input.parent(), stream, mr), + input.null_count(), + nulls_equal, + nans_equal, + stream, + mr); } } // namespace detail diff --git a/cpp/tests/lists/stream_compaction/distinct.cpp b/cpp/tests/lists/stream_compaction/distinct.cpp index 290e295d7da..632682f6c8d 100644 --- a/cpp/tests/lists/stream_compaction/distinct.cpp +++ b/cpp/tests/lists/stream_compaction/distinct.cpp @@ -77,5 +77,5 @@ TEST_F(ListDistinctTest, TrivialTest) lists_col{} /*NULL*/, lists_col{{null, 0.0, 1.0, 5.0, NaN}, null_at(0)}}, null_at(2)}; - CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results_sorted); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } From 16101f708e9e94e4e3b34913e702588e2c663bc2 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 27 Jun 2022 10:07:22 -0700 Subject: [PATCH 04/32] Implement `cudf::detail::stable_distinct` and `lists::distinct` --- cpp/CMakeLists.txt | 1 + cpp/include/cudf/detail/stream_compaction.hpp | 27 +++++ .../cudf/lists/detail/stream_compaction.hpp | 35 ------ cpp/src/lists/stream_compaction/distinct.cu | 112 ++++-------------- cpp/src/stream_compaction/stable_distinct.cu | 66 +++++++++++ 5 files changed, 118 insertions(+), 123 deletions(-) create mode 100644 cpp/src/stream_compaction/stable_distinct.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 8e9d5ec9809..dacd622b718 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -452,6 +452,7 @@ add_library( src/stream_compaction/distinct_reduce.cu src/stream_compaction/drop_nans.cu 
src/stream_compaction/drop_nulls.cu + src/stream_compaction/stable_distinct.cu src/stream_compaction/unique.cu src/stream_compaction/unique_count.cu src/strings/attributes.cu diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 8ba7b0cb996..25a92d0e9b5 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -22,6 +22,7 @@ #include #include +#include namespace cudf { namespace detail { @@ -89,6 +90,32 @@ std::unique_ptr distinct( rmm::cuda_stream_view stream = cudf::default_stream_value, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Create a new table without duplicate rows while preserving the original row order. + * + * Given an `input` table_view, each row is copied to the output table (with row order preserved) to + * create a set of distinct rows. If there are duplicate rows, which row is copied depends on the + * specified value of the `keep` parameter. + * + * This API produces exactly the same set of output rows as `cudf::distinct`. + * + * @param input The input table + * @param keep Copy any, first, last, or none of the found duplicates + * @param nulls_equal Flag to specify whether null elements should be considered as equal + * @param nans_equal Flag to specify whether NaN elements should be considered as equal + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the returned table + * @return A table containing the resulting distinct rows + */ +std::unique_ptr
stable_distinct( + table_view const& input, + std::vector const& keys, + duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, + null_equality nulls_equal = null_equality::EQUAL, + nan_equality nans_equal = nan_equality::ALL_EQUAL, + rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Create a column of indices of all distinct rows in the input table. * diff --git a/cpp/include/cudf/lists/detail/stream_compaction.hpp b/cpp/include/cudf/lists/detail/stream_compaction.hpp index 326277acca4..ba3dbb6594b 100644 --- a/cpp/include/cudf/lists/detail/stream_compaction.hpp +++ b/cpp/include/cudf/lists/detail/stream_compaction.hpp @@ -46,39 +46,4 @@ std::unique_ptr distinct( rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); -/** - * @brief Remove duplicate list elements from a lists column given by its child column and an - * associated list labels column. - * - * The input lists column is not given to this function directly. Instead, its child column and a - * label column containing the corresponding list labels for each element are given. The output null - * mask and null count are also provided as input parameters to this function. - * - * This function generates exactly the same output as the API - * `cudf::lists::distinct(lists_column_view const& input)` but requires a different set of - * input parameters. This is because it is going to be called internally in various other APIs where - * the label column and the output null_mask and null_count already exist. 
- * - * @param n_lists Number of lists in the input and output lists columns - * @param child_labels Array containing labels of the list elements - * @param child The child column of the input lists column - * @param null_mask The null_mask used for constructing the output column - * @param null_count The null_count used for constructing the output column - * @param nulls_equal Flag to specify whether null elements should be considered as equal - * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal - * @param stream CUDA stream used for device memory operations and kernel launches - * @param mr Device memory resource used to allocate the returned object - * @return The final lists column without duplicate list elements - */ -std::unique_ptr distinct_by_labels( - size_type n_lists, - column_view const& child_labels, - column_view const& child, - rmm::device_buffer&& null_mask, - size_type null_count, - null_equality nulls_equal, - nan_equality nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - } // namespace cudf::lists::detail diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu index 4a4b507f266..3b71ef01fd5 100644 --- a/cpp/src/lists/stream_compaction/distinct.cu +++ b/cpp/src/lists/stream_compaction/distinct.cu @@ -17,90 +17,18 @@ #include #include -#include -#include -#include -#include #include #include -#include #include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include +#include +#include #include -#include -#include - -#include -#include namespace cudf::lists { namespace detail { -std::unique_ptr distinct_by_labels(size_type n_lists, - column_view const& child_labels, - column_view const& child, - rmm::device_buffer&& null_mask, - size_type null_count, - null_equality nulls_equal, - nan_equality 
nans_equal, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Algorithm: - // - Get indices of distinct rows of the table {labels, child}. - // - Scatter these indices into a marker array that marks if a row will be copied to the output. - // - Collect output rows (with order preserved) using the marker array and build the output - // offsets column. - - auto const input_table = table_view{{child_labels, child}}; - - auto const distinct_indices = cudf::detail::get_distinct_indices( - input_table, duplicate_keep_option::KEEP_ANY, nulls_equal, nans_equal, stream); - - auto const index_markers = [&] { - auto markers = rmm::device_uvector(child.size(), stream); - thrust::uninitialized_fill(rmm::exec_policy(stream), markers.begin(), markers.end(), false); - thrust::scatter( - rmm::exec_policy(stream), - thrust::constant_iterator(true, 0), - thrust::constant_iterator(true, static_cast(distinct_indices.size())), - distinct_indices.begin(), - markers.begin()); - return markers; - }(); - - auto const out_table = cudf::detail::copy_if( - input_table, - [index_markers = index_markers.begin()] __device__(auto const idx) { - return index_markers[idx]; - }, - stream, - mr); - auto out_offsets = reconstruct_offsets(out_table->get_column(0).view(), n_lists, stream, mr); - - return make_lists_column(n_lists, - std::move(out_offsets), - std::move(out_table->release().back()), - null_count, - std::move(null_mask), - stream, - mr); -} - std::unique_ptr distinct(lists_column_view const& input, null_equality nulls_equal, nan_equality nans_equal, @@ -109,23 +37,31 @@ std::unique_ptr distinct(lists_column_view const& input, { // Algorithm: // - Generate labels for the child elements. - // - Get indices of distinct rows of the table {labels, child}. - // - Scatter these indices into a marker array that marks if a row will be copied to the output. - // - Collect output rows (with order preserved) using the marker array and build the output - // lists column. 
+ // - Get distinct rows of the table {labels, child} using `cudf::stable_distinct`. + // - Build the output lists column from the output distinct rows above. auto const child = input.get_sliced_child(stream); auto const labels = generate_labels(input, child.size(), stream); - return distinct_by_labels(input.size(), - labels->view(), - child, - cudf::detail::copy_bitmask(input.parent(), stream, mr), - input.null_count(), - nulls_equal, - nans_equal, - stream, - mr); + auto const distinct_table = + cudf::detail::stable_distinct(table_view{{labels->view(), child}}, // input table + std::vector{0, 1}, // keys + duplicate_keep_option::KEEP_ANY, + nulls_equal, + nans_equal, + stream, + mr); + + auto out_offsets = + reconstruct_offsets(distinct_table->get_column(0).view(), input.size(), stream, mr); + + return make_lists_column(input.size(), + std::move(out_offsets), + std::move(distinct_table->release().back()), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + stream, + mr); } } // namespace detail diff --git a/cpp/src/stream_compaction/stable_distinct.cu b/cpp/src/stream_compaction/stable_distinct.cu new file mode 100644 index 00000000000..2e1feb5c618 --- /dev/null +++ b/cpp/src/stream_compaction/stable_distinct.cu @@ -0,0 +1,66 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include + +#include +#include +#include + +namespace cudf::detail { + +std::unique_ptr
stable_distinct(table_view const& input, + std::vector const& keys, + duplicate_keep_option keep, + null_equality nulls_equal, + nan_equality nans_equal, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (input.num_rows() == 0 or input.num_columns() == 0 or keys.empty()) { + return empty_like(input); + } + + auto const distinct_indices = + get_distinct_indices(input.select(keys), keep, nulls_equal, nans_equal, stream); + + // Markers to denote which row to be copied to the output. + auto const index_markers = [&] { + auto markers = rmm::device_uvector(input.num_rows(), stream); + thrust::uninitialized_fill(rmm::exec_policy(stream), markers.begin(), markers.end(), false); + thrust::scatter( + rmm::exec_policy(stream), + thrust::constant_iterator(true, 0), + thrust::constant_iterator(true, static_cast(distinct_indices.size())), + distinct_indices.begin(), + markers.begin()); + return markers; + }(); + + return cudf::detail::copy_if( + input, + [index_markers = index_markers.begin()] __device__(auto const idx) { + return index_markers[idx]; + }, + stream, + mr); +} + +} // namespace cudf::detail From 5ec13d6bb42ead9fddf908236c2c7166c2ae513e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 27 Jun 2022 10:08:35 -0700 Subject: [PATCH 05/32] Rewrite doxygen --- cpp/include/cudf/detail/stream_compaction.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 25a92d0e9b5..955744042a3 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -91,7 +91,7 @@ std::unique_ptr
distinct( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Create a new table without duplicate rows while preserving the original row order. + * @brief Create a new table without duplicate rows with the original row order preserved. * * Given an `input` table_view, each row is copied to the output table (with row order preserved) to * create a set of distinct rows. If there are duplicate rows, which row to be copied depends on the From 6c5b738c8004c201c4f1e42c2acc81f10010960b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 27 Jun 2022 10:10:20 -0700 Subject: [PATCH 06/32] Rename variable --- cpp/src/stream_compaction/stable_distinct.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/stream_compaction/stable_distinct.cu b/cpp/src/stream_compaction/stable_distinct.cu index 2e1feb5c618..2336e3e8b52 100644 --- a/cpp/src/stream_compaction/stable_distinct.cu +++ b/cpp/src/stream_compaction/stable_distinct.cu @@ -42,7 +42,7 @@ std::unique_ptr
stable_distinct(table_view const& input, get_distinct_indices(input.select(keys), keep, nulls_equal, nans_equal, stream); // Markers to denote which row to be copied to the output. - auto const index_markers = [&] { + auto const output_markers = [&] { auto markers = rmm::device_uvector(input.num_rows(), stream); thrust::uninitialized_fill(rmm::exec_policy(stream), markers.begin(), markers.end(), false); thrust::scatter( @@ -56,8 +56,8 @@ std::unique_ptr
stable_distinct(table_view const& input, return cudf::detail::copy_if( input, - [index_markers = index_markers.begin()] __device__(auto const idx) { - return index_markers[idx]; + [output_markers = output_markers.begin()] __device__(auto const idx) { + return output_markers[idx]; }, stream, mr); From 5b70eee9484df99a96cf05f05f4089ce5bd0459b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 27 Jun 2022 10:10:49 -0700 Subject: [PATCH 07/32] Rewrite comment --- cpp/src/stream_compaction/stable_distinct.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/stream_compaction/stable_distinct.cu b/cpp/src/stream_compaction/stable_distinct.cu index 2336e3e8b52..b863e9fd6ea 100644 --- a/cpp/src/stream_compaction/stable_distinct.cu +++ b/cpp/src/stream_compaction/stable_distinct.cu @@ -41,7 +41,7 @@ std::unique_ptr
stable_distinct(table_view const& input, auto const distinct_indices = get_distinct_indices(input.select(keys), keep, nulls_equal, nans_equal, stream); - // Markers to denote which row to be copied to the output. + // Markers to denote which rows to be copied to the output. auto const output_markers = [&] { auto markers = rmm::device_uvector(input.num_rows(), stream); thrust::uninitialized_fill(rmm::exec_policy(stream), markers.begin(), markers.end(), false); From 238248d5b06bc6aaa8440cbea7c1218938a0e4df Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 27 Jun 2022 15:08:33 -0700 Subject: [PATCH 08/32] Rename files --- cpp/tests/CMakeLists.txt | 4 ++-- ...ply_boolean_mask_test.cpp => apply_boolean_mask_tests.cpp} | 0 .../stream_compaction/{distinct.cpp => distinct_tests.cpp} | 0 3 files changed, 2 insertions(+), 2 deletions(-) rename cpp/tests/lists/stream_compaction/{apply_boolean_mask_test.cpp => apply_boolean_mask_tests.cpp} (100%) rename cpp/tests/lists/stream_compaction/{distinct.cpp => distinct_tests.cpp} (100%) diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 7ce3bd6e0e2..6d725825a05 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -480,8 +480,8 @@ ConfigureTest( lists/extract_tests.cpp lists/sequences_tests.cpp lists/sort_lists_tests.cpp - lists/stream_compaction/apply_boolean_mask_test.cpp - lists/stream_compaction/distinct.cpp + lists/stream_compaction/apply_boolean_mask_tests.cpp + lists/stream_compaction/distinct_tests.cpp ) # ################################################################################################## diff --git a/cpp/tests/lists/stream_compaction/apply_boolean_mask_test.cpp b/cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp similarity index 100% rename from cpp/tests/lists/stream_compaction/apply_boolean_mask_test.cpp rename to cpp/tests/lists/stream_compaction/apply_boolean_mask_tests.cpp diff --git a/cpp/tests/lists/stream_compaction/distinct.cpp 
b/cpp/tests/lists/stream_compaction/distinct_tests.cpp similarity index 100% rename from cpp/tests/lists/stream_compaction/distinct.cpp rename to cpp/tests/lists/stream_compaction/distinct_tests.cpp From ba6bf6b037467172dd7f4de8a80239d9fb746731 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 27 Jun 2022 15:35:42 -0700 Subject: [PATCH 09/32] Implement float tests --- .../stream_compaction/distinct_tests.cpp | 94 ++++++++++++++----- 1 file changed, 72 insertions(+), 22 deletions(-) diff --git a/cpp/tests/lists/stream_compaction/distinct_tests.cpp b/cpp/tests/lists/stream_compaction/distinct_tests.cpp index 632682f6c8d..71bd9c37376 100644 --- a/cpp/tests/lists/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/lists/stream_compaction/distinct_tests.cpp @@ -23,32 +23,44 @@ #include #include +using float_type = double; +using namespace cudf::test::iterators; +// using cudf::nan_policy; +// using cudf::null_equality; +// using cudf::null_policy; + auto constexpr null{0}; // null at current level // auto constexpr XXX{0}; // null pushed down from parent level -auto constexpr NaN = std::numeric_limits::quiet_NaN(); +auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); +auto constexpr neg_Inf = -std::numeric_limits::infinity(); +auto constexpr NaN = std::numeric_limits::quiet_NaN(); +auto constexpr Inf = std::numeric_limits::infinity(); +auto constexpr NAN_EQUAL = cudf::nan_equality::ALL_EQUAL; +auto constexpr NAN_UNEQUAL = cudf::nan_equality::UNEQUAL; using bools_col = cudf::test::fixed_width_column_wrapper; // using int32s_col = cudf::test::fixed_width_column_wrapper; -// using floats_col = cudf::test::fixed_width_column_wrapper; -using lists_col = cudf::test::lists_column_wrapper; +using floats_col = cudf::test::fixed_width_column_wrapper; +using floats_lists = cudf::test::lists_column_wrapper; +using strings_lists = cudf::test::lists_column_wrapper; // using strings_col = cudf::test::strings_column_wrapper; // using structs_col = 
cudf::test::structs_column_wrapper; using lists_cv = cudf::lists_column_view; -// using cudf::nan_policy; -// using cudf::null_equality; -// using cudf::null_policy; -// using cudf::test::iterators::no_nulls; -using cudf::test::iterators::null_at; -// using cudf::test::iterators::nulls_at; - namespace { -auto distinct_sorted(cudf::column_view const& input) + +auto distinct_sorted(cudf::column_view const& input, cudf::nan_equality nans_equal = NAN_EQUAL) { - auto const results = cudf::lists::distinct(lists_cv{input}); + auto const results = + cudf::lists::distinct(lists_cv{input}, cudf::null_equality::EQUAL, nans_equal); + + // The sorted result will have nulls first and NaNs last. + // In addition, row equality comparisons in tests just ignore NaN sign thus the expected values + // can be just NaN while the input can be mixed of NaN and neg_NaN. return cudf::lists::sort_lists( lists_cv{*results}, cudf::order::ASCENDING, cudf::null_order::BEFORE); } + } // namespace struct ListDistinctTest : public cudf::test::BaseFixture { @@ -65,17 +77,55 @@ TYPED_TEST_SUITE(ListDistinctTypedTest, TestTypes); TEST_F(ListDistinctTest, TrivialTest) { - auto const input = lists_col{{lists_col{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 0.0}, null_at(6)}, - lists_col{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 1.0}, null_at(6)}, - {} /*NULL*/, - lists_col{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 1.0}, null_at(6)}}, - null_at(2)}; + auto const input = + floats_lists{{floats_lists{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 0.0}, null_at(6)}, + floats_lists{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 1.0}, null_at(6)}, + {} /*NULL*/, + floats_lists{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 1.0}, null_at(6)}}, + null_at(2)}; + auto const expected = floats_lists{{floats_lists{{null, 0.0, 5.0, NaN}, null_at(0)}, + floats_lists{{null, 0.0, 1.0, 5.0, NaN}, null_at(0)}, + floats_lists{} /*NULL*/, + floats_lists{{null, 0.0, 1.0, 5.0, NaN}, null_at(0)}}, + null_at(2)}; auto const results_sorted = distinct_sorted(input); - auto 
const expected = lists_col{{lists_col{{null, 0.0, 5.0, NaN}, null_at(0)}, - lists_col{{null, 0.0, 1.0, 5.0, NaN}, null_at(0)}, - lists_col{} /*NULL*/, - lists_col{{null, 0.0, 1.0, 5.0, NaN}, null_at(0)}}, - null_at(2)}; CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } + +TEST_F(ListDistinctTest, FloatingPointTestsWithSignedZero) +{ + // -0.0 and 0.0 should be considered equal. + auto const input = floats_lists{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0, 3}; + auto const expect = floats_lists{0, 1, 2, 3}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, *results_sorted); +} + +TEST_F(ListDistinctTest, FloatingPointTestsWithInf) +{ + auto const input = floats_lists{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}; + auto const expected = floats_lists{neg_Inf, 0, Inf}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); +} + +TEST_F(ListDistinctTest, FloatingPointTestsWithNaNs) +{ + auto const input = + floats_lists{0, -1, 1, NaN, 2, 0, neg_NaN, 1, -2, 2, 0, 1, 2, neg_NaN, NaN, NaN, NaN, neg_NaN}; + + // NaNs are equal. + { + auto const expected = floats_lists{-2, -1, 0, 1, 2, NaN}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // NaNs are unequal. 
+ { + auto const expected = floats_lists{-2, -1, 0, 1, 2, NaN, NaN, NaN, NaN, NaN, NaN, NaN}; + auto const results_sorted = distinct_sorted(input, NAN_UNEQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} From 3845c9560fa974879169756ef17e67c0bcd8071d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 27 Jun 2022 15:43:11 -0700 Subject: [PATCH 10/32] Implement string tests --- .../stream_compaction/distinct_tests.cpp | 136 ++++++++++++++++-- 1 file changed, 123 insertions(+), 13 deletions(-) diff --git a/cpp/tests/lists/stream_compaction/distinct_tests.cpp b/cpp/tests/lists/stream_compaction/distinct_tests.cpp index 71bd9c37376..f76bba9b7c0 100644 --- a/cpp/tests/lists/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/lists/stream_compaction/distinct_tests.cpp @@ -31,12 +31,14 @@ using namespace cudf::test::iterators; auto constexpr null{0}; // null at current level // auto constexpr XXX{0}; // null pushed down from parent level -auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); -auto constexpr neg_Inf = -std::numeric_limits::infinity(); -auto constexpr NaN = std::numeric_limits::quiet_NaN(); -auto constexpr Inf = std::numeric_limits::infinity(); -auto constexpr NAN_EQUAL = cudf::nan_equality::ALL_EQUAL; -auto constexpr NAN_UNEQUAL = cudf::nan_equality::UNEQUAL; +auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); +auto constexpr neg_Inf = -std::numeric_limits::infinity(); +auto constexpr NaN = std::numeric_limits::quiet_NaN(); +auto constexpr Inf = std::numeric_limits::infinity(); +auto constexpr NULL_EQUAL = cudf::null_equality::EQUAL; +auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL; +auto constexpr NAN_EQUAL = cudf::nan_equality::ALL_EQUAL; +auto constexpr NAN_UNEQUAL = cudf::nan_equality::UNEQUAL; using bools_col = cudf::test::fixed_width_column_wrapper; // using int32s_col = cudf::test::fixed_width_column_wrapper; @@ -49,10 +51,11 @@ using lists_cv = cudf::lists_column_view; namespace { -auto 
distinct_sorted(cudf::column_view const& input, cudf::nan_equality nans_equal = NAN_EQUAL) +auto distinct_sorted(cudf::column_view const& input, + cudf::null_equality nulls_equal = NULL_EQUAL, + cudf::nan_equality nans_equal = NAN_EQUAL) { - auto const results = - cudf::lists::distinct(lists_cv{input}, cudf::null_equality::EQUAL, nans_equal); + auto const results = cudf::lists::distinct(lists_cv{input}, nulls_equal, nans_equal); // The sorted result will have nulls first and NaNs last. // In addition, row equality comparisons in tests just ignore NaN sign thus the expected values @@ -96,10 +99,10 @@ TEST_F(ListDistinctTest, TrivialTest) TEST_F(ListDistinctTest, FloatingPointTestsWithSignedZero) { // -0.0 and 0.0 should be considered equal. - auto const input = floats_lists{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0, 3}; - auto const expect = floats_lists{0, 1, 2, 3}; + auto const input = floats_lists{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0, 3}; + auto const expected = floats_lists{0, 1, 2, 3}; auto const results_sorted = distinct_sorted(input); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(expect, *results_sorted); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } TEST_F(ListDistinctTest, FloatingPointTestsWithInf) @@ -125,7 +128,114 @@ TEST_F(ListDistinctTest, FloatingPointTestsWithNaNs) // NaNs are unequal. { auto const expected = floats_lists{-2, -1, 0, 1, 2, NaN, NaN, NaN, NaN, NaN, NaN, NaN}; - auto const results_sorted = distinct_sorted(input, NAN_UNEQUAL); + auto const results_sorted = distinct_sorted(input, NULL_EQUAL, NAN_UNEQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +TEST_F(ListDistinctTest, StringTestsNonNull) +{ + // Trivial cases - empty input. + { + auto const input = strings_lists{{}}; + auto const expected = strings_lists{{}}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // No duplicate. 
+ { + auto const input = strings_lists{"this", "is", "a", "string"}; + auto const expected = strings_lists{"a", "is", "string", "this"}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // One list column. + { + auto const input = strings_lists{"this", "is", "is", "is", "a", "string", "string"}; + auto const expected = strings_lists{"a", "is", "string", "this"}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // Multiple lists column. + { + auto const input = strings_lists{ + strings_lists{"this", "is", "a", "no duplicate", "string"}, + strings_lists{"this", "is", "is", "a", "one duplicate", "string"}, + strings_lists{"this", "is", "is", "is", "a", "two duplicates", "string"}, + strings_lists{"this", "is", "is", "is", "is", "a", "three duplicates", "string"}}; + auto const expected = + strings_lists{strings_lists{"a", "is", "no duplicate", "string", "this"}, + strings_lists{"a", "is", "one duplicate", "string", "this"}, + strings_lists{"a", "is", "string", "this", "two duplicates"}, + strings_lists{"a", "is", "string", "this", "three duplicates"}}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +TEST_F(ListDistinctTest, StringTestsWithNullsEqual) +{ + auto const null = std::string(""); + + // One list column with null entries. + { + auto const input = strings_lists{ + {"this", null, "is", "is", "is", "a", null, "string", null, "string"}, nulls_at({1, 6, 8})}; + auto const expected = strings_lists{{null, "a", "is", "string", "this"}, null_at(0)}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // Multiple lists column with null lists and null entries. 
+ { + auto const input = strings_lists{ + {strings_lists{{"this", null, "is", null, "a", null, "no duplicate", null, "string"}, + nulls_at({1, 3, 5, 7})}, + strings_lists{}, /* NULL */ + strings_lists{"this", "is", "is", "a", "one duplicate", "string"}}, + null_at(1)}; + auto const expected = + strings_lists{{strings_lists{{null, "a", "is", "no duplicate", "string", "this"}, null_at(0)}, + strings_lists{}, /* NULL */ + strings_lists{"a", "is", "one duplicate", "string", "this"}}, + null_at(1)}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +TEST_F(ListDistinctTest, StringTestsWithNullsUnequal) +{ + auto const null = std::string(""); + + // One list column with null entries. + { + auto const input = strings_lists{ + {"this", null, "is", "is", "is", "a", null, "string", null, "string"}, nulls_at({1, 6, 8})}; + auto const expected = + strings_lists{{null, null, null, "a", "is", "string", "this"}, nulls_at({0, 1, 2})}; + auto const results_sorted = distinct_sorted(input, NULL_UNEQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // Multiple lists column with null lists and null entries. 
+ { + auto const input = strings_lists{ + {strings_lists{{"this", null, "is", null, "a", null, "no duplicate", null, "string"}, + nulls_at({1, 3, 5, 7})}, + strings_lists{}, /* NULL */ + strings_lists{"this", "is", "is", "a", "one duplicate", "string"}}, + null_at(1)}; + auto const expected = strings_lists{ + {strings_lists{{null, null, null, null, "a", "is", "no duplicate", "string", "this"}, + nulls_at({0, 1, 2, 3})}, + strings_lists{}, /* NULL */ + strings_lists{"a", "is", "one duplicate", "string", "this"}}, + null_at(1)}; + auto const results_sorted = distinct_sorted(input, NULL_UNEQUAL); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } } From 507c82df18d3cef7c900c88247ea0b936ba54627 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 27 Jun 2022 17:46:11 -0700 Subject: [PATCH 11/32] Implement tests for `ListDistinctTypedTest` --- .../stream_compaction/distinct_tests.cpp | 117 +++++++++++++++++- 1 file changed, 115 insertions(+), 2 deletions(-) diff --git a/cpp/tests/lists/stream_compaction/distinct_tests.cpp b/cpp/tests/lists/stream_compaction/distinct_tests.cpp index f76bba9b7c0..16903b0a2a8 100644 --- a/cpp/tests/lists/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/lists/stream_compaction/distinct_tests.cpp @@ -73,8 +73,8 @@ template struct ListDistinctTypedTest : public cudf::test::BaseFixture { }; -using TestTypes = cudf::test:: - Concat; +using TestTypes = + cudf::test::Concat; TYPED_TEST_SUITE(ListDistinctTypedTest, TestTypes); @@ -239,3 +239,116 @@ TEST_F(ListDistinctTest, StringTestsWithNullsUnequal) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } } + +TYPED_TEST(ListDistinctTypedTest, TrivialInputTests) +{ + using lists_col = cudf::test::lists_column_wrapper; + + // Empty input. + { + auto const input = lists_col{}; + auto const expected = lists_col{}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // All input lists are empty. 
+ { + auto const input = lists_col{lists_col{}, lists_col{}, lists_col{}}; + auto const expected = lists_col{lists_col{}, lists_col{}, lists_col{}}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // Trivial cases. + { + auto const input = lists_col{0, 1, 2, 3, 4, 5}; + auto const expected = lists_col{0, 1, 2, 3, 4, 5}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // Multiple empty lists. + { + auto const input = lists_col{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}; + auto const expected = lists_col{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +TYPED_TEST(ListDistinctTypedTest, SlicedNonNullInputTests) +{ + using lists_col = cudf::test::lists_column_wrapper; + + auto const input_original = + lists_col{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; + + { + auto const expected = lists_col{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const results_sorted = distinct_sorted(input_original); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + { + auto const input = cudf::slice(input_original, {0, 5})[0]; + auto const expected = lists_col{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + { + auto const input = cudf::slice(input_original, {1, 5})[0]; + auto const expected = lists_col{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + { + auto const input = cudf::slice(input_original, {1, 3})[0]; + auto const expected = lists_col{{1, 2, 3, 4}, {5}}; + auto const results_sorted = distinct_sorted(input); + 
CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + { + auto const input = cudf::slice(input_original, {0, 3})[0]; + auto const expected = lists_col{{1, 2, 3}, {1, 2, 3, 4}, {5}}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +TYPED_TEST(ListDistinctTypedTest, InputHaveNullsTests) +{ + using lists_col = cudf::test::lists_column_wrapper; + auto constexpr null = TypeParam{0}; + + // Nullable lists. + { + auto const input = lists_col{ + {{3, 2, 1, 4, 1}, {5}, {} /*NULL*/, {} /*NULL*/, {10, 8, 9}, {6, 7}}, nulls_at({2, 3})}; + auto const expected = lists_col{ + {{1, 2, 3, 4}, {5}, {} /*NULL*/, {} /*NULL*/, {8, 9, 10}, {6, 7}}, nulls_at({2, 3})}; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // Nullable child and nulls are equal. + { + auto const input = + lists_col{{null, 1, null, 3, null, 5, null, 7, null, 9}, nulls_at({0, 2, 4, 6, 8})}; + auto const expected = lists_col{{null, 1, 3, 5, 7, 9}, null_at(0)}; + auto const results_sorted = distinct_sorted(input, NULL_EQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } + + // Nullable child and nulls are unequal. 
+ { + auto const input = lists_col{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nulls_at({0, 2, 4, 6, 8})}; + auto const expected = + lists_col{{null, null, null, null, null, 1, 3, 5, 7, 9}, nulls_at({0, 1, 2, 3, 4})}; + auto const results_sorted = distinct_sorted(input, NULL_UNEQUAL); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} From 2cb8347aef8d3a422aa4d7522914ee04f41978c7 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 27 Jun 2022 20:34:36 -0700 Subject: [PATCH 12/32] Complete the remaining tests --- .../stream_compaction/distinct_tests.cpp | 398 +++++++++++++++++- 1 file changed, 388 insertions(+), 10 deletions(-) diff --git a/cpp/tests/lists/stream_compaction/distinct_tests.cpp b/cpp/tests/lists/stream_compaction/distinct_tests.cpp index 16903b0a2a8..53cd9391d47 100644 --- a/cpp/tests/lists/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/lists/stream_compaction/distinct_tests.cpp @@ -20,17 +20,15 @@ #include #include +#include #include #include using float_type = double; using namespace cudf::test::iterators; -// using cudf::nan_policy; -// using cudf::null_equality; -// using cudf::null_policy; auto constexpr null{0}; // null at current level -// auto constexpr XXX{0}; // null pushed down from parent level +auto constexpr XXX{0}; // null pushed down from parent level auto constexpr neg_NaN = -std::numeric_limits::quiet_NaN(); auto constexpr neg_Inf = -std::numeric_limits::infinity(); auto constexpr NaN = std::numeric_limits::quiet_NaN(); @@ -40,14 +38,12 @@ auto constexpr NULL_UNEQUAL = cudf::null_equality::UNEQUAL; auto constexpr NAN_EQUAL = cudf::nan_equality::ALL_EQUAL; auto constexpr NAN_UNEQUAL = cudf::nan_equality::UNEQUAL; -using bools_col = cudf::test::fixed_width_column_wrapper; -// using int32s_col = cudf::test::fixed_width_column_wrapper; -using floats_col = cudf::test::fixed_width_column_wrapper; +using int32s_col = cudf::test::fixed_width_column_wrapper; using floats_lists = cudf::test::lists_column_wrapper; using 
strings_lists = cudf::test::lists_column_wrapper; -// using strings_col = cudf::test::strings_column_wrapper; -// using structs_col = cudf::test::structs_column_wrapper; -using lists_cv = cudf::lists_column_view; +using strings_col = cudf::test::strings_column_wrapper; +using structs_col = cudf::test::structs_column_wrapper; +using lists_cv = cudf::lists_column_view; namespace { @@ -352,3 +348,385 @@ TYPED_TEST(ListDistinctTypedTest, InputHaveNullsTests) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } } + +TEST_F(ListDistinctTest, InputListsOfStructsNoNull) +{ + auto const get_structs = [] { + auto child1 = int32s_col{ + 1, 1, 1, 1, 1, 1, 1, 1, // list1 + 1, 1, 1, 1, 2, 1, 2, 2, // list2 + 2, 2, 2, 2, 3, 2, 3, 3 // list3 + }; + auto child2 = strings_col{ + // begin list1 + "Banana", + "Mango", + "Apple", + "Cherry", + "Kiwi", + "Banana", + "Cherry", + "Kiwi", // end list1 + // begin list2 + "Bear", + "Duck", + "Cat", + "Dog", + "Panda", + "Bear", + "Cat", + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "ÁBC", + "XYZ", + "ÁÁÁ", + "ÁBC", + "XYZ" // end list3 + }; + return structs_col{{child1, child2}}; + }; + + auto const get_expected = [] { + auto child1 = int32s_col{1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 3, 3}; + auto child2 = strings_col{ + // begin list1 + "Apple", + "Banana", + "Cherry", + "Kiwi", + "Mango", // end list1 + // begin list2 + "Bear", + "Cat", + "Dog", + "Duck", + "Cat", + "Panda", // end list2 + // begin list3 + "ÁBC", + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "XYZ", + "ÁBC" // end list3 + }; + return structs_col{{child1, child2}}; + }; + + // Test full columns. 
+ { + auto const input = cudf::make_lists_column( + 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected = cudf::make_lists_column( + 3, int32s_col{0, 5, 11, 17}.release(), get_expected().release(), 0, {}); + auto const results_sorted = distinct_sorted(*input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *results_sorted); + } + + // Test sliced columns. + { + auto const input_original = cudf::make_lists_column( + 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected_original = cudf::make_lists_column( + 3, int32s_col{0, 5, 11, 17}.release(), get_expected().release(), 0, {}); + auto const input = cudf::slice(*input_original, {1, 3})[0]; + auto const expected = cudf::slice(*expected_original, {1, 3})[0]; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +TEST_F(ListDistinctTest, InputListsOfStructsHaveNull) +{ + auto const get_structs = [] { + auto child1 = int32s_col{{ + 1, 1, null, XXX, XXX, 1, 1, 1, // list1 + 1, 1, 1, 1, 2, 1, null, 2, // list2 + null, null, 2, 2, 3, 2, 3, 3 // list3 + }, + nulls_at({2, 14, 16, 17})}; + auto child2 = strings_col{{ + // begin list1 + "Banana", + "Mango", + "Apple", + "XXX", /*NULL*/ + "XXX", /*NULL*/ + "Banana", + "Cherry", + "Kiwi", // end list1 + // begin list2 + "Bear", + "Duck", + "Cat", + "Dog", + "Panda", + "Bear", + "" /*NULL*/, + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "ÁBC", + "" /*NULL*/, + "ÁÁÁ", + "ÁBC", + "XYZ" // end list3 + }, + nulls_at({14, 20})}; + return structs_col{{child1, child2}, nulls_at({3, 4})}; + }; + + auto const get_expected = [] { + auto child1 = int32s_col{{ // begin list1 + XXX, // end list1 + null, + 1, + 1, + 1, + 1, + // begin list2 + null, // end list2 + 1, + 1, + 1, + 1, + 2, + // begin list3 + null, + null, + 2, + 2, + 2, + 3, + 3, + 3}, // end list3 + nulls_at({1, 6, 12, 13})}; + auto child2 = strings_col{{ 
// begin list1 + "XXX", /*NULL*/ + "Apple", + "Banana", + "Cherry", + "Kiwi", + "Mango", // end list1 + // begin list2 + "", /*NULL*/ + "Bear", + "Cat", + "Dog", + "Duck", + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÁBC", + "ÁÁÁ", + "ÍÍÍÍÍ", + "", /*NULL*/ + "XYZ", + "ÁBC"}, // end list3 + nulls_at({6, 17})}; + return structs_col{{child1, child2}, null_at(0)}; + }; + + // Test full columns. + { + auto const input = cudf::make_lists_column( + 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected = cudf::make_lists_column( + 3, int32s_col{0, 6, 12, 20}.release(), get_expected().release(), 0, {}); + auto const results_sorted = distinct_sorted(*input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *results_sorted); + } + + // Test sliced columns. + { + auto const input_original = cudf::make_lists_column( + 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected_original = cudf::make_lists_column( + 3, int32s_col{0, 6, 12, 20}.release(), get_expected().release(), 0, {}); + auto const input = cudf::slice(*input_original, {1, 3})[0]; + auto const expected = cudf::slice(*expected_original, {1, 3})[0]; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +TEST_F(ListDistinctTest, InputListsOfNestedStructsHaveNull) +{ + auto const get_structs = [] { + auto grandchild1 = int32s_col{{ + 1, XXX, null, XXX, XXX, 1, 1, 1, // list1 + 1, 1, 1, 1, 2, 1, null, 2, // list2 + null, null, 2, 2, 3, 2, 3, 3 // list3 + }, + nulls_at({2, 14, 16, 17})}; + auto grandchild2 = strings_col{{ + // begin list1 + "Banana", + "YYY", /*NULL*/ + "Apple", + "XXX", /*NULL*/ + "YYY", /*NULL*/ + "Banana", + "Cherry", + "Kiwi", // end list1 + // begin list2 + "Bear", + "Duck", + "Cat", + "Dog", + "Panda", + "Bear", + "" /*NULL*/, + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÍÍÍÍÍ", + "ÁBC", + "" /*NULL*/, + "ÁÁÁ", + "ÁBC", 
+ "XYZ" // end list3 + }, + nulls_at({14, 20})}; + auto child1 = structs_col{{grandchild1, grandchild2}, nulls_at({1, 3, 4})}; + return structs_col{{child1}}; + }; + + auto const get_expected = [] { + auto grandchild1 = int32s_col{{// begin list1 + XXX, + null, + 1, + 1, + 1, // end list1 + // begin list2 + null, + 1, + 1, + 1, + 1, + 2, // end list2 + // begin list3 + null, + null, + 2, + 2, + 2, + 3, + 3, + 3}, + nulls_at({1, 5, 11, 12})}; + auto grandchild2 = strings_col{{ + // begin list1 + "XXX" /*NULL*/, + "Apple", + "Banana", + "Cherry", + "Kiwi", // end list1 + // begin list2 + "" /*NULL*/, + "Bear", + "Cat", + "Dog", + "Duck", + "Panda", // end list2 + // begin list3 + "ÁÁÁ", + "ÉÉÉÉÉ", + "ÁBC", + "ÁÁÁ", + "ÍÍÍÍÍ", + "", /*NULL*/ + "XYZ", + "ÁBC" // end list3 + }, + nulls_at({5, 16})}; + auto child1 = structs_col{{grandchild1, grandchild2}, nulls_at({0})}; + return structs_col{{child1}}; + }; + + // Test full columns. + { + auto const input = cudf::make_lists_column( + 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected = cudf::make_lists_column( + 3, int32s_col{0, 5, 11, 19}.release(), get_expected().release(), 0, {}); + auto const results_sorted = distinct_sorted(*input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *results_sorted); + } + + // Test sliced columns. 
+ { + auto const input_original = cudf::make_lists_column( + 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); + auto const expected_original = cudf::make_lists_column( + 3, int32s_col{0, 5, 11, 19}.release(), get_expected().release(), 0, {}); + auto const input = cudf::slice(*input_original, {1, 3})[0]; + auto const expected = cudf::slice(*expected_original, {1, 3})[0]; + auto const results_sorted = distinct_sorted(input); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); + } +} + +TEST_F(ListDistinctTest, InputListsOfStructsOfLists) +{ + auto const input = [] { + auto const get_structs = [] { + auto child1 = int32s_col{// begin list1 + 0, + 0, + 0, // end list1 + // begin list2 + 2, // end list2 + // begin list3 + 3, + 3, + 3}; + auto child2 = floats_lists{// begin list1 + floats_lists{0, 1}, + floats_lists{0, 1}, + floats_lists{0, 1}, // end list1 + // begin list2 + floats_lists{3, 4, 5}, // end list2 + // begin list3 + floats_lists{6, 7}, + floats_lists{6, 7}, + floats_lists{6, 7}}; + return structs_col{{child1, child2}}; + }; + + return cudf::make_lists_column( + 3, int32s_col{0, 3, 4, 7}.release(), get_structs().release(), 0, {}); + }(); + + auto const expected = [] { + auto const get_structs = [] { + auto child1 = int32s_col{ // begin list1 + 0, // end list1 + // begin list2 + 2, // end list2 + // begin list3 + 3}; + auto child2 = floats_lists{ // begin list1 + floats_lists{0, 1}, // end list1 + // begin list2 + floats_lists{3, 4, 5}, // end list2 + // begin list3 + floats_lists{6, 7}}; + return structs_col{{child1, child2}}; + }; + + return cudf::make_lists_column( + 3, int32s_col{0, 1, 2, 3}.release(), get_structs().release(), 0, {}); + }(); + + auto const results = cudf::lists::distinct(lists_cv{*input}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *results); +} From 43886372ee6ec601895e5134ea383aba9a9e66ba Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 27 Jun 2022 20:37:34 -0700 Subject: [PATCH 13/32] Rewrite 
doxygen --- cpp/include/cudf/lists/stream_compaction.hpp | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/cpp/include/cudf/lists/stream_compaction.hpp b/cpp/include/cudf/lists/stream_compaction.hpp index 69f94a2c4bf..551a2ae4dcd 100644 --- a/cpp/include/cudf/lists/stream_compaction.hpp +++ b/cpp/include/cudf/lists/stream_compaction.hpp @@ -56,16 +56,17 @@ std::unique_ptr apply_boolean_mask( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief + * @brief Create a new list column by copying elements from the input lists column ignoring + * duplicate list elements. * - * - * A null input row in any of the input lists columns will result in a null output row. The order of - * elements within each list is not preserved. + * Given a lists column, an output lists column is generated by copying elements from the input + * lists column in a way such that the duplicate elements in each list are ignored, producing only + * distinct list elements. 
* * Example: * @code{.pseudo} - * input = { {0, 1, 2, 3, 2}, {1, 2, 3}, null, {4, null, null, 5} } - * result = { {0, 1, 2, 3}, {1, 2, 3}, null, {4, null, 5} } + * input = { {0, 1, 2, 3, 2}, {3, 1, 2}, null, {4, null, null, 5} } + * result = { {0, 1, 2, 3}, {3, 1, 2}, null, {4, null, 5} } * @endcode * * @param input The input lists column From 4dd5e7447fc866c9456ae582a182e86fea7319de Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 28 Jun 2022 06:49:22 -0700 Subject: [PATCH 14/32] Misc --- cpp/src/lists/utilities.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/lists/utilities.cu b/cpp/src/lists/utilities.cu index fb38e3711e5..95582ad5715 100644 --- a/cpp/src/lists/utilities.cu +++ b/cpp/src/lists/utilities.cu @@ -43,7 +43,7 @@ std::unique_ptr reconstruct_offsets(column_view const& labels, data_type{type_to_id()}, n_lists + 1, mask_state::UNALLOCATED, stream, mr); auto const labels_begin = labels.template begin(); - auto const offsets_begin = out_offsets->mutable_view().template begin(); + auto const offsets_begin = out_offsets->mutable_view().template begin(); cudf::detail::labels_to_offsets(labels_begin, labels_begin + labels.size(), offsets_begin, From 3b0760c4c2c8956752cdb6bd66eaa2abaf30c1ae Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 28 Jun 2022 09:19:55 -0700 Subject: [PATCH 15/32] Misc --- cpp/src/lists/stream_compaction/distinct.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu index 3b71ef01fd5..4874eaf2211 100644 --- a/cpp/src/lists/stream_compaction/distinct.cu +++ b/cpp/src/lists/stream_compaction/distinct.cu @@ -37,7 +37,7 @@ std::unique_ptr distinct(lists_column_view const& input, { // Algorithm: // - Generate labels for the child elements. - // - Get distinct rows of the table {labels, child} using `cudf::stable_distinct`. 
+ // - Get distinct rows of the table {labels, child} using `stable_distinct`. // - Build the output lists column from the output distinct rows above. auto const child = input.get_sliced_child(stream); From 9730b70c9d5377d3e7140c65434f1d4a7938e705 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 28 Jun 2022 09:27:33 -0700 Subject: [PATCH 16/32] Rewrite test --- .../stream_compaction/distinct_tests.cpp | 27 +++++++++---------- 1 file changed, 12 insertions(+), 15 deletions(-) diff --git a/cpp/tests/lists/stream_compaction/distinct_tests.cpp b/cpp/tests/lists/stream_compaction/distinct_tests.cpp index 53cd9391d47..56d788edf45 100644 --- a/cpp/tests/lists/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/lists/stream_compaction/distinct_tests.cpp @@ -684,8 +684,11 @@ TEST_F(ListDistinctTest, InputListsOfStructsOfLists) 0, 0, // end list1 // begin list2 - 2, // end list2 + 1, // end list2 // begin list3 + 2, + 2, // end list3 + // begin list4 3, 3, 3}; @@ -696,6 +699,9 @@ TEST_F(ListDistinctTest, InputListsOfStructsOfLists) // begin list2 floats_lists{3, 4, 5}, // end list2 // begin list3 + floats_lists{}, + floats_lists{}, // end list3 + // begin list4 floats_lists{6, 7}, floats_lists{6, 7}, floats_lists{6, 7}}; @@ -703,28 +709,19 @@ TEST_F(ListDistinctTest, InputListsOfStructsOfLists) }; return cudf::make_lists_column( - 3, int32s_col{0, 3, 4, 7}.release(), get_structs().release(), 0, {}); + 4, int32s_col{0, 3, 4, 6, 9}.release(), get_structs().release(), 0, {}); }(); auto const expected = [] { auto const get_structs = [] { - auto child1 = int32s_col{ // begin list1 - 0, // end list1 - // begin list2 - 2, // end list2 - // begin list3 - 3}; - auto child2 = floats_lists{ // begin list1 - floats_lists{0, 1}, // end list1 - // begin list2 - floats_lists{3, 4, 5}, // end list2 - // begin list3 - floats_lists{6, 7}}; + auto child1 = int32s_col{0, 1, 2, 3}; + auto child2 = + floats_lists{floats_lists{0, 1}, floats_lists{3, 4, 5}, floats_lists{}, floats_lists{6, 
7}}; return structs_col{{child1, child2}}; }; return cudf::make_lists_column( - 3, int32s_col{0, 1, 2, 3}.release(), get_structs().release(), 0, {}); + 4, int32s_col{0, 1, 2, 3, 4}.release(), get_structs().release(), 0, {}); }(); auto const results = cudf::lists::distinct(lists_cv{*input}); From 9bd9b6f62c56899f0bd4c66d08df65d69e311dd4 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 28 Jun 2022 09:32:38 -0700 Subject: [PATCH 17/32] Fix doxygen --- cpp/include/cudf/detail/stream_compaction.hpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 955744042a3..24d0135dd80 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -100,7 +100,7 @@ std::unique_ptr
distinct( * This API produces exactly the same set of output rows as `cudf::distinct`. * * @param input The input table - * @param keep Get index of any, first, last, or none of the found duplicates + * @param keep Copy any, first, last, or none of the found duplicates * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether NaN elements should be considered as equal * @param stream CUDA stream used for device memory operations and kernel launches From 790a482a55637966a42549443c1451a304af09b6 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 28 Jun 2022 09:35:45 -0700 Subject: [PATCH 18/32] Fix header --- cpp/src/lists/utilities.hpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/cpp/src/lists/utilities.hpp b/cpp/src/lists/utilities.hpp index 1ca48665a29..3c58aa0bfab 100644 --- a/cpp/src/lists/utilities.hpp +++ b/cpp/src/lists/utilities.hpp @@ -16,8 +16,6 @@ #pragma once -#include "utilities.hpp" - #include #include From 1c58baace3c8a062700061d10f0bff424ec9ff9e Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 28 Jun 2022 09:39:33 -0700 Subject: [PATCH 19/32] Rewrite doxygen --- cpp/src/lists/utilities.hpp | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/cpp/src/lists/utilities.hpp b/cpp/src/lists/utilities.hpp index 3c58aa0bfab..248f9ae061e 100644 --- a/cpp/src/lists/utilities.hpp +++ b/cpp/src/lists/utilities.hpp @@ -27,18 +27,19 @@ namespace cudf::lists::detail { * @brief Generate list labels for elements in the child column of the input lists column. 
* * @param input The input lists column + * @param n_elements The number of elements in the child column of the input lists column * @param stream CUDA stream used for device memory operations and kernel launches - * @return A column containing list labels corresponding to each input list elements + * @return A column containing list labels corresponding to each element in the child column */ std::unique_ptr generate_labels(lists_column_view const& input, size_type n_elements, rmm::cuda_stream_view stream); /** - * @brief Reconstruct an offsets column from the input labels array. + * @brief Reconstruct an offsets column from the input list labels column. * - * @param labels The list labels corresponding to each input list elements - * @param n_lists The number of lists in the input lists column + * @param labels The list labels corresponding to each list element + * @param n_lists The number of lists to build the offsets column * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned object * @return The output offsets column From d493c4fa048b815b8d1eafbd9c0b3616c9b4244b Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Tue, 28 Jun 2022 11:26:09 -0700 Subject: [PATCH 20/32] Rewrite doxygen and fix headers --- cpp/include/cudf/detail/stream_compaction.hpp | 3 ++- cpp/include/cudf/lists/stream_compaction.hpp | 2 +- cpp/src/lists/stream_compaction/distinct.cu | 3 +++ cpp/src/lists/utilities.hpp | 1 + 4 files changed, 7 insertions(+), 2 deletions(-) diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 24d0135dd80..2f19eea8088 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -100,12 +100,13 @@ std::unique_ptr
distinct( * This API produces exactly the same set of output rows as `cudf::distinct`. * * @param input The input table + * @param keys Vector of indices indicating key columns in the `input` table * @param keep Copy any, first, last, or none of the found duplicates * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether NaN elements should be considered as equal * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned table - * @return A device_uvector containing the result indices + * @return A table containing the resulting distinct rows */ std::unique_ptr
stable_distinct( table_view const& input, diff --git a/cpp/include/cudf/lists/stream_compaction.hpp b/cpp/include/cudf/lists/stream_compaction.hpp index 551a2ae4dcd..68cdeec606b 100644 --- a/cpp/include/cudf/lists/stream_compaction.hpp +++ b/cpp/include/cudf/lists/stream_compaction.hpp @@ -73,7 +73,7 @@ std::unique_ptr apply_boolean_mask( * @param nulls_equal Flag to specify whether null elements should be considered as equal * @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal * @param mr Device memory resource used to allocate the returned object - * @return A column of type BOOL containing the check result + * @return The resulting lists column containing lists without duplicates */ std::unique_ptr distinct( lists_column_view const& input, diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu index 4874eaf2211..d17c1897f34 100644 --- a/cpp/src/lists/stream_compaction/distinct.cu +++ b/cpp/src/lists/stream_compaction/distinct.cu @@ -26,6 +26,9 @@ #include +#include +#include + namespace cudf::lists { namespace detail { diff --git a/cpp/src/lists/utilities.hpp b/cpp/src/lists/utilities.hpp index 248f9ae061e..aab7ec1ad81 100644 --- a/cpp/src/lists/utilities.hpp +++ b/cpp/src/lists/utilities.hpp @@ -20,6 +20,7 @@ #include #include +#include namespace cudf::lists::detail { From d090d2a2532daea886cf4c7b01f2fcd20a10df81 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 30 Jun 2022 09:05:25 -0700 Subject: [PATCH 21/32] Fix iterator type --- cpp/src/stream_compaction/stable_distinct.cu | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cpp/src/stream_compaction/stable_distinct.cu b/cpp/src/stream_compaction/stable_distinct.cu index b863e9fd6ea..dc80a454777 100644 --- a/cpp/src/stream_compaction/stable_distinct.cu +++ b/cpp/src/stream_compaction/stable_distinct.cu @@ -47,8 +47,8 @@ std::unique_ptr
stable_distinct(table_view const& input, thrust::uninitialized_fill(rmm::exec_policy(stream), markers.begin(), markers.end(), false); thrust::scatter( rmm::exec_policy(stream), - thrust::constant_iterator(true, 0), - thrust::constant_iterator(true, static_cast(distinct_indices.size())), + thrust::constant_iterator(true, 0), + thrust::constant_iterator(true, static_cast(distinct_indices.size())), distinct_indices.begin(), markers.begin()); return markers; @@ -57,7 +57,7 @@ std::unique_ptr
stable_distinct(table_view const& input, return cudf::detail::copy_if( input, [output_markers = output_markers.begin()] __device__(auto const idx) { - return output_markers[idx]; + return *(output_markers + idx); }, stream, mr); From ee5182247ee952f232fb35d780c99587eb4b98af Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 30 Jun 2022 09:05:59 -0700 Subject: [PATCH 22/32] Rewrite doxygen --- cpp/include/cudf/detail/stream_compaction.hpp | 10 ++++++---- cpp/include/cudf/lists/stream_compaction.hpp | 9 ++++----- 2 files changed, 10 insertions(+), 9 deletions(-) diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 2f19eea8088..0db929c523c 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -91,11 +91,13 @@ std::unique_ptr
distinct( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Create a new table without duplicate rows with the original row order preserved. + * @brief Create a new table without duplicate rows. * - * Given an `input` table_view, each row is copied to the output table (with row order preserved) to - * create a set of distinct rows. If there are duplicate rows, which row to be copied depends on the - * specified value of the `keep` parameter. + * Given an `input` table_view, each row is copied to the output table to create a set of distinct + * rows. The row order is guaranteed to be preserved as in the input. + * + * If there are duplicate rows, which row to be copied depends on the specified value of the `keep` + * parameter. * * This API produces exactly the same set of output rows as `cudf::distinct`. * diff --git a/cpp/include/cudf/lists/stream_compaction.hpp b/cpp/include/cudf/lists/stream_compaction.hpp index 68cdeec606b..26d3846ab3d 100644 --- a/cpp/include/cudf/lists/stream_compaction.hpp +++ b/cpp/include/cudf/lists/stream_compaction.hpp @@ -56,12 +56,11 @@ std::unique_ptr apply_boolean_mask( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Create a new list column by copying elements from the input lists column ignoring - * duplicate list elements. + * @brief Create a new list column without duplicate elements in each list. * - * Given a lists column, an output lists column is generated by copying elements from the input - * lists column in a way such that the duplicate elements in each list are ignored, producing only - * distinct list elements. + * Given a lists column `input`, distinct elements of each list are copied to the corresponding + * output list. The order of lists is preserved while the order of elements within each list is not + * guaranteed. 
* * Example: * @code{.pseudo} From ccdd6f08a8afea0572ae0617214a7b91bb387973 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 30 Jun 2022 09:06:15 -0700 Subject: [PATCH 23/32] Add empty lines --- .../stream_compaction/distinct_tests.cpp | 103 ++++++++++++------ 1 file changed, 67 insertions(+), 36 deletions(-) diff --git a/cpp/tests/lists/stream_compaction/distinct_tests.cpp b/cpp/tests/lists/stream_compaction/distinct_tests.cpp index 56d788edf45..93a72cfb9ce 100644 --- a/cpp/tests/lists/stream_compaction/distinct_tests.cpp +++ b/cpp/tests/lists/stream_compaction/distinct_tests.cpp @@ -83,11 +83,12 @@ TEST_F(ListDistinctTest, TrivialTest) floats_lists{{NaN, 5.0, 0.0, 0.0, 0.0, 0.0, null, 1.0}, null_at(6)}}, null_at(2)}; - auto const expected = floats_lists{{floats_lists{{null, 0.0, 5.0, NaN}, null_at(0)}, + auto const expected = floats_lists{{floats_lists{{null, 0.0, 5.0, NaN}, null_at(0)}, floats_lists{{null, 0.0, 1.0, 5.0, NaN}, null_at(0)}, floats_lists{} /*NULL*/, floats_lists{{null, 0.0, 1.0, 5.0, NaN}, null_at(0)}}, null_at(2)}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } @@ -97,14 +98,16 @@ TEST_F(ListDistinctTest, FloatingPointTestsWithSignedZero) // -0.0 and 0.0 should be considered equal. 
auto const input = floats_lists{0.0, 1, 2, -0.0, 1, 2, 0.0, 1, 2, -0.0, -0.0, 0.0, 0.0, 3}; auto const expected = floats_lists{0, 1, 2, 3}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } TEST_F(ListDistinctTest, FloatingPointTestsWithInf) { - auto const input = floats_lists{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}; - auto const expected = floats_lists{neg_Inf, 0, Inf}; + auto const input = floats_lists{Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf, 0, Inf, 0, neg_Inf}; + auto const expected = floats_lists{neg_Inf, 0, Inf}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } @@ -116,14 +119,16 @@ TEST_F(ListDistinctTest, FloatingPointTestsWithNaNs) // NaNs are equal. { - auto const expected = floats_lists{-2, -1, 0, 1, 2, NaN}; + auto const expected = floats_lists{-2, -1, 0, 1, 2, NaN}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } // NaNs are unequal. { - auto const expected = floats_lists{-2, -1, 0, 1, 2, NaN, NaN, NaN, NaN, NaN, NaN, NaN}; + auto const expected = floats_lists{-2, -1, 0, 1, 2, NaN, NaN, NaN, NaN, NaN, NaN, NaN}; + auto const results_sorted = distinct_sorted(input, NULL_EQUAL, NAN_UNEQUAL); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } @@ -133,24 +138,27 @@ TEST_F(ListDistinctTest, StringTestsNonNull) { // Trivial cases - empty input. { - auto const input = strings_lists{{}}; - auto const expected = strings_lists{{}}; + auto const input = strings_lists{{}}; + auto const expected = strings_lists{{}}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } // No duplicate. 
{ - auto const input = strings_lists{"this", "is", "a", "string"}; - auto const expected = strings_lists{"a", "is", "string", "this"}; + auto const input = strings_lists{"this", "is", "a", "string"}; + auto const expected = strings_lists{"a", "is", "string", "this"}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } // One list column. { - auto const input = strings_lists{"this", "is", "is", "is", "a", "string", "string"}; - auto const expected = strings_lists{"a", "is", "string", "this"}; + auto const input = strings_lists{"this", "is", "is", "is", "a", "string", "string"}; + auto const expected = strings_lists{"a", "is", "string", "this"}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } @@ -167,6 +175,7 @@ TEST_F(ListDistinctTest, StringTestsNonNull) strings_lists{"a", "is", "one duplicate", "string", "this"}, strings_lists{"a", "is", "string", "this", "two duplicates"}, strings_lists{"a", "is", "string", "this", "three duplicates"}}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } @@ -180,7 +189,8 @@ TEST_F(ListDistinctTest, StringTestsWithNullsEqual) { auto const input = strings_lists{ {"this", null, "is", "is", "is", "a", null, "string", null, "string"}, nulls_at({1, 6, 8})}; - auto const expected = strings_lists{{null, "a", "is", "string", "this"}, null_at(0)}; + auto const expected = strings_lists{{null, "a", "is", "string", "this"}, null_at(0)}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } @@ -198,6 +208,7 @@ TEST_F(ListDistinctTest, StringTestsWithNullsEqual) strings_lists{}, /* NULL */ strings_lists{"a", "is", "one duplicate", "string", "this"}}, null_at(1)}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } @@ -213,6 +224,7 @@ 
TEST_F(ListDistinctTest, StringTestsWithNullsUnequal) {"this", null, "is", "is", "is", "a", null, "string", null, "string"}, nulls_at({1, 6, 8})}; auto const expected = strings_lists{{null, null, null, "a", "is", "string", "this"}, nulls_at({0, 1, 2})}; + auto const results_sorted = distinct_sorted(input, NULL_UNEQUAL); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } @@ -231,6 +243,7 @@ TEST_F(ListDistinctTest, StringTestsWithNullsUnequal) strings_lists{}, /* NULL */ strings_lists{"a", "is", "one duplicate", "string", "this"}}, null_at(1)}; + auto const results_sorted = distinct_sorted(input, NULL_UNEQUAL); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } @@ -242,32 +255,36 @@ TYPED_TEST(ListDistinctTypedTest, TrivialInputTests) // Empty input. { - auto const input = lists_col{}; - auto const expected = lists_col{}; + auto const input = lists_col{}; + auto const expected = lists_col{}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } // All input lists are empty. { - auto const input = lists_col{lists_col{}, lists_col{}, lists_col{}}; - auto const expected = lists_col{lists_col{}, lists_col{}, lists_col{}}; + auto const input = lists_col{lists_col{}, lists_col{}, lists_col{}}; + auto const expected = lists_col{lists_col{}, lists_col{}, lists_col{}}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } // Trivial cases. { - auto const input = lists_col{0, 1, 2, 3, 4, 5}; - auto const expected = lists_col{0, 1, 2, 3, 4, 5}; + auto const input = lists_col{0, 1, 2, 3, 4, 5}; + auto const expected = lists_col{0, 1, 2, 3, 4, 5}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } // Multiple empty lists. 
{ - auto const input = lists_col{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}; - auto const expected = lists_col{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}; + auto const input = lists_col{{}, {}, {5, 4, 3, 2, 1, 0}, {}, {6}, {}}; + auto const expected = lists_col{{}, {}, {0, 1, 2, 3, 4, 5}, {}, {6}, {}}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } @@ -281,35 +298,40 @@ TYPED_TEST(ListDistinctTypedTest, SlicedNonNullInputTests) lists_col{{1, 2, 3, 2, 3, 2, 3, 2, 3}, {3, 2, 1, 4, 1}, {5}, {10, 8, 9}, {6, 7}}; { - auto const expected = lists_col{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const expected = lists_col{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const results_sorted = distinct_sorted(input_original); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } { - auto const input = cudf::slice(input_original, {0, 5})[0]; - auto const expected = lists_col{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const input = cudf::slice(input_original, {0, 5})[0]; + auto const expected = lists_col{{1, 2, 3}, {1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } { - auto const input = cudf::slice(input_original, {1, 5})[0]; - auto const expected = lists_col{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const input = cudf::slice(input_original, {1, 5})[0]; + auto const expected = lists_col{{1, 2, 3, 4}, {5}, {8, 9, 10}, {6, 7}}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } { - auto const input = cudf::slice(input_original, {1, 3})[0]; - auto const expected = lists_col{{1, 2, 3, 4}, {5}}; + auto const input = cudf::slice(input_original, {1, 3})[0]; + auto const expected = lists_col{{1, 2, 3, 4}, {5}}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, 
*results_sorted); } { - auto const input = cudf::slice(input_original, {0, 3})[0]; - auto const expected = lists_col{{1, 2, 3}, {1, 2, 3, 4}, {5}}; + auto const input = cudf::slice(input_original, {0, 3})[0]; + auto const expected = lists_col{{1, 2, 3}, {1, 2, 3, 4}, {5}}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } @@ -326,6 +348,7 @@ TYPED_TEST(ListDistinctTypedTest, InputHaveNullsTests) {{3, 2, 1, 4, 1}, {5}, {} /*NULL*/, {} /*NULL*/, {10, 8, 9}, {6, 7}}, nulls_at({2, 3})}; auto const expected = lists_col{ {{1, 2, 3, 4}, {5}, {} /*NULL*/, {} /*NULL*/, {8, 9, 10}, {6, 7}}, nulls_at({2, 3})}; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } @@ -334,7 +357,8 @@ TYPED_TEST(ListDistinctTypedTest, InputHaveNullsTests) { auto const input = lists_col{{null, 1, null, 3, null, 5, null, 7, null, 9}, nulls_at({0, 2, 4, 6, 8})}; - auto const expected = lists_col{{null, 1, 3, 5, 7, 9}, null_at(0)}; + auto const expected = lists_col{{null, 1, 3, 5, 7, 9}, null_at(0)}; + auto const results_sorted = distinct_sorted(input, NULL_EQUAL); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } @@ -344,6 +368,7 @@ TYPED_TEST(ListDistinctTypedTest, InputHaveNullsTests) auto const input = lists_col{{0, 1, 2, 3, 4, 5, 6, 7, 8, 9}, nulls_at({0, 2, 4, 6, 8})}; auto const expected = lists_col{{null, null, null, null, null, 1, 3, 5, 7, 9}, nulls_at({0, 1, 2, 3, 4})}; + auto const results_sorted = distinct_sorted(input, NULL_UNEQUAL); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } @@ -422,6 +447,7 @@ TEST_F(ListDistinctTest, InputListsOfStructsNoNull) 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); auto const expected = cudf::make_lists_column( 3, int32s_col{0, 5, 11, 17}.release(), get_expected().release(), 0, {}); + auto const results_sorted = distinct_sorted(*input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, 
*results_sorted); } @@ -432,8 +458,9 @@ TEST_F(ListDistinctTest, InputListsOfStructsNoNull) 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); auto const expected_original = cudf::make_lists_column( 3, int32s_col{0, 5, 11, 17}.release(), get_expected().release(), 0, {}); - auto const input = cudf::slice(*input_original, {1, 3})[0]; - auto const expected = cudf::slice(*expected_original, {1, 3})[0]; + auto const input = cudf::slice(*input_original, {1, 3})[0]; + auto const expected = cudf::slice(*expected_original, {1, 3})[0]; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } @@ -539,6 +566,7 @@ TEST_F(ListDistinctTest, InputListsOfStructsHaveNull) 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); auto const expected = cudf::make_lists_column( 3, int32s_col{0, 6, 12, 20}.release(), get_expected().release(), 0, {}); + auto const results_sorted = distinct_sorted(*input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *results_sorted); } @@ -549,8 +577,9 @@ TEST_F(ListDistinctTest, InputListsOfStructsHaveNull) 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); auto const expected_original = cudf::make_lists_column( 3, int32s_col{0, 6, 12, 20}.release(), get_expected().release(), 0, {}); - auto const input = cudf::slice(*input_original, {1, 3})[0]; - auto const expected = cudf::slice(*expected_original, {1, 3})[0]; + auto const input = cudf::slice(*input_original, {1, 3})[0]; + auto const expected = cudf::slice(*expected_original, {1, 3})[0]; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } @@ -658,6 +687,7 @@ TEST_F(ListDistinctTest, InputListsOfNestedStructsHaveNull) 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); auto const expected = cudf::make_lists_column( 3, int32s_col{0, 5, 11, 19}.release(), get_expected().release(), 0, {}); + auto const 
results_sorted = distinct_sorted(*input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *results_sorted); } @@ -668,8 +698,9 @@ TEST_F(ListDistinctTest, InputListsOfNestedStructsHaveNull) 3, int32s_col{0, 8, 16, 24}.release(), get_structs().release(), 0, {}); auto const expected_original = cudf::make_lists_column( 3, int32s_col{0, 5, 11, 19}.release(), get_expected().release(), 0, {}); - auto const input = cudf::slice(*input_original, {1, 3})[0]; - auto const expected = cudf::slice(*expected_original, {1, 3})[0]; + auto const input = cudf::slice(*input_original, {1, 3})[0]; + auto const expected = cudf::slice(*expected_original, {1, 3})[0]; + auto const results_sorted = distinct_sorted(input); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results_sorted); } From b1231a27445409e9d73e91d342cf895d58be4cf1 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 30 Jun 2022 10:04:07 -0700 Subject: [PATCH 24/32] Update default stream --- cpp/src/lists/stream_compaction/distinct.cu | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu index d17c1897f34..78333e9665d 100644 --- a/cpp/src/lists/stream_compaction/distinct.cu +++ b/cpp/src/lists/stream_compaction/distinct.cu @@ -75,7 +75,7 @@ std::unique_ptr distinct(lists_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::distinct(input, nulls_equal, nans_equal, rmm::cuda_stream_default, mr); + return detail::distinct(input, nulls_equal, nans_equal, cudf::default_stream_value, mr); } } // namespace cudf::lists From 99d70b136dfff7d15c8a34fc8ab8b7494d86fe59 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Thu, 7 Jul 2022 20:38:12 -0700 Subject: [PATCH 25/32] Handle empty input --- cpp/src/lists/stream_compaction/distinct.cu | 3 +++ 1 file changed, 3 insertions(+) diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu index 
78333e9665d..c88209292de 100644 --- a/cpp/src/lists/stream_compaction/distinct.cu +++ b/cpp/src/lists/stream_compaction/distinct.cu @@ -17,6 +17,7 @@ #include #include +#include #include #include #include @@ -43,6 +44,8 @@ std::unique_ptr distinct(lists_column_view const& input, // - Get distinct rows of the table {labels, child} using `stable_distinct`. // - Build the output lists column from the output distinct rows above. + if (input.is_empty()) { return empty_like(input.parent()); } + auto const child = input.get_sliced_child(stream); auto const labels = generate_labels(input, child.size(), stream); From 717118ef267185a8467d539d103a472d2c0db430 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Fri, 8 Jul 2022 16:20:35 -0700 Subject: [PATCH 26/32] Use `lists::distinct` --- ...{drop_list_duplicates.pxd => stream_compaction.pxd} | 4 ++-- python/cudf/cudf/_lib/lists.pyx | 10 +++++----- python/cudf/cudf/core/column/lists.py | 4 ++-- 3 files changed, 9 insertions(+), 9 deletions(-) rename python/cudf/cudf/_lib/cpp/lists/{drop_list_duplicates.pxd => stream_compaction.pxd} (80%) diff --git a/python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd b/python/cudf/cudf/_lib/cpp/lists/stream_compaction.pxd similarity index 80% rename from python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd rename to python/cudf/cudf/_lib/cpp/lists/stream_compaction.pxd index 81d54104320..c4f31d54a03 100644 --- a/python/cudf/cudf/_lib/cpp/lists/drop_list_duplicates.pxd +++ b/python/cudf/cudf/_lib/cpp/lists/stream_compaction.pxd @@ -7,9 +7,9 @@ from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.types cimport nan_equality, null_equality -cdef extern from "cudf/lists/drop_list_duplicates.hpp" \ +cdef extern from "cudf/lists/stream_compaction.hpp" \ namespace "cudf::lists" nogil: - cdef unique_ptr[column] drop_list_duplicates( + cdef unique_ptr[column] distinct( const lists_column_view lists_column, null_equality nulls_equal, nan_equality nans_equal 
diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 025fb0665d3..0a06cc2a22c 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -15,8 +15,8 @@ from cudf._lib.cpp.lists.combine cimport ( from cudf._lib.cpp.lists.count_elements cimport ( count_elements as cpp_count_elements, ) -from cudf._lib.cpp.lists.drop_list_duplicates cimport ( - drop_list_duplicates as cpp_drop_list_duplicates, +from cudf._lib.cpp.lists.distinct cimport ( + distinct as cpp_distinct, ) from cudf._lib.cpp.lists.explode cimport explode_outer as cpp_explode_outer from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view @@ -96,9 +96,9 @@ def drop_list_duplicates(Column col, bool nulls_equal, bool nans_all_equal): with nogil: c_result = move( - cpp_drop_list_duplicates(list_view.get()[0], - c_nulls_equal, - c_nans_equal) + cpp_distinct(list_view.get()[0], + c_nulls_equal, + c_nans_equal) ) return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index e8a5638f07a..d72e219e1e1 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -13,7 +13,7 @@ concatenate_rows, contains_scalar, count_elements, - drop_list_duplicates, + distinct, extract_element_column, extract_element_scalar, index_of_column, @@ -603,7 +603,7 @@ def unique(self) -> ParentType: raise NotImplementedError("Nested lists unique is not supported.") return self._return_or_inplace( - drop_list_duplicates( + distinct( self._column, nulls_equal=True, nans_all_equal=True ) ) From 24b7d0f40cb550078861df7ff735ec02a1434219 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 10 Jul 2022 13:19:26 -0700 Subject: [PATCH 27/32] Fix style --- python/cudf/cudf/_lib/lists.pyx | 4 +--- python/cudf/cudf/core/column/lists.py | 4 +--- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/lists.pyx 
b/python/cudf/cudf/_lib/lists.pyx index 0a06cc2a22c..130f4d48cf2 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -15,9 +15,7 @@ from cudf._lib.cpp.lists.combine cimport ( from cudf._lib.cpp.lists.count_elements cimport ( count_elements as cpp_count_elements, ) -from cudf._lib.cpp.lists.distinct cimport ( - distinct as cpp_distinct, -) +from cudf._lib.cpp.lists.distinct cimport distinct as cpp_distinct from cudf._lib.cpp.lists.explode cimport explode_outer as cpp_explode_outer from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.lists.sorting cimport sort_lists as cpp_sort_lists diff --git a/python/cudf/cudf/core/column/lists.py b/python/cudf/cudf/core/column/lists.py index d72e219e1e1..c6a19f374bd 100644 --- a/python/cudf/cudf/core/column/lists.py +++ b/python/cudf/cudf/core/column/lists.py @@ -603,9 +603,7 @@ def unique(self) -> ParentType: raise NotImplementedError("Nested lists unique is not supported.") return self._return_or_inplace( - distinct( - self._column, nulls_equal=True, nans_all_equal=True - ) + distinct(self._column, nulls_equal=True, nans_all_equal=True) ) def sort_values( From cfc7b9a34affbbe6442cae12eca7b45561b090f3 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 10 Jul 2022 13:28:50 -0700 Subject: [PATCH 28/32] Update copyright year --- python/cudf/cudf/_lib/cpp/lists/stream_compaction.pxd | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/cpp/lists/stream_compaction.pxd b/python/cudf/cudf/_lib/cpp/lists/stream_compaction.pxd index c4f31d54a03..58c1ab1dcec 100644 --- a/python/cudf/cudf/_lib/cpp/lists/stream_compaction.pxd +++ b/python/cudf/cudf/_lib/cpp/lists/stream_compaction.pxd @@ -1,4 +1,4 @@ -# Copyright (c) 2021, NVIDIA CORPORATION. +# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
from libcpp.memory cimport unique_ptr From e9576133fdf4ac15b392af8d1c094960dd8670aa Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Sun, 10 Jul 2022 22:06:01 -0700 Subject: [PATCH 29/32] Fix import --- python/cudf/cudf/_lib/lists.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 130f4d48cf2..45253704b55 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -15,7 +15,7 @@ from cudf._lib.cpp.lists.combine cimport ( from cudf._lib.cpp.lists.count_elements cimport ( count_elements as cpp_count_elements, ) -from cudf._lib.cpp.lists.distinct cimport distinct as cpp_distinct +from cudf._lib.cpp.lists.stream_compaction cimport distinct as cpp_distinct from cudf._lib.cpp.lists.explode cimport explode_outer as cpp_explode_outer from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.lists.sorting cimport sort_lists as cpp_sort_lists From 1f41fa5f2f095d07a43de43bb6773ce21d5ca6fb Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 11 Jul 2022 06:14:07 -0700 Subject: [PATCH 30/32] Fix style --- python/cudf/cudf/_lib/lists.pyx | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 45253704b55..6e556db0b5e 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -15,10 +15,10 @@ from cudf._lib.cpp.lists.combine cimport ( from cudf._lib.cpp.lists.count_elements cimport ( count_elements as cpp_count_elements, ) -from cudf._lib.cpp.lists.stream_compaction cimport distinct as cpp_distinct from cudf._lib.cpp.lists.explode cimport explode_outer as cpp_explode_outer from cudf._lib.cpp.lists.lists_column_view cimport lists_column_view from cudf._lib.cpp.lists.sorting cimport sort_lists as cpp_sort_lists +from cudf._lib.cpp.lists.stream_compaction cimport distinct as cpp_distinct from cudf._lib.cpp.scalar.scalar cimport 
scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view From a0440ec6011a90a89288e7d0d4782b4377216283 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 11 Jul 2022 11:37:27 -0700 Subject: [PATCH 31/32] Rename function --- python/cudf/cudf/_lib/lists.pyx | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index 6e556db0b5e..d7e171b8860 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -73,12 +73,11 @@ def explode_outer( return columns_from_unique_ptr(move(c_result)) -def drop_list_duplicates(Column col, bool nulls_equal, bool nans_all_equal): - """ - nans_all_equal == True indicates that libcudf should treat any two elements - from {+nan, -nan} as equal, and as unequal otherwise. - nulls_equal == True indicates that libcudf should treat any two nulls as - equal, and as unequal otherwise. +def distinct(Column col, bool nulls_equal, bool nans_all_equal): + """ + nulls_equal == True indicates that libcudf should treat any two nulls as equal, and as unequal otherwise. + nans_all_equal == True indicates that libcudf should treat any two elements from {-nan, +nan} as equal, + and as unequal otherwise. 
""" cdef shared_ptr[lists_column_view] list_view = ( make_shared[lists_column_view](col.view()) From 7323122ef4b67e4ad6994e519ee3add402e129ad Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Mon, 11 Jul 2022 11:59:47 -0700 Subject: [PATCH 32/32] Fix style --- python/cudf/cudf/_lib/lists.pyx | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/python/cudf/cudf/_lib/lists.pyx b/python/cudf/cudf/_lib/lists.pyx index d7e171b8860..581207c97a5 100644 --- a/python/cudf/cudf/_lib/lists.pyx +++ b/python/cudf/cudf/_lib/lists.pyx @@ -74,10 +74,11 @@ def explode_outer( def distinct(Column col, bool nulls_equal, bool nans_all_equal): - """ - nulls_equal == True indicates that libcudf should treat any two nulls as equal, and as unequal otherwise. - nans_all_equal == True indicates that libcudf should treat any two elements from {-nan, +nan} as equal, - and as unequal otherwise. + """ + nulls_equal == True indicates that libcudf should treat any two nulls as + equal, and as unequal otherwise. + nans_all_equal == True indicates that libcudf should treat any two + elements from {-nan, +nan} as equal, and as unequal otherwise. """ cdef shared_ptr[lists_column_view] list_view = ( make_shared[lists_column_view](col.view())