diff --git a/cpp/src/stream_compaction/distinct_count.cu b/cpp/src/stream_compaction/distinct_count.cu index 8cde6e0a7ed..760fcf4bb6b 100644 --- a/cpp/src/stream_compaction/distinct_count.cu +++ b/cpp/src/stream_compaction/distinct_count.cu @@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -125,9 +126,8 @@ cudf::size_type distinct_count(table_view const& keys, null_equality nulls_equal, rmm::cuda_stream_view stream) { - auto table_ptr = cudf::table_device_view::create(keys, stream); - auto const num_rows = table_ptr->num_rows(); - auto const has_null = nullate::DYNAMIC{cudf::has_nulls(keys)}; + auto const num_rows = keys.num_rows(); + auto const has_nulls = nullate::DYNAMIC{cudf::has_nested_nulls(keys)}; hash_map_type key_map{compute_hash_table_size(num_rows), cuco::empty_key{COMPACTION_EMPTY_KEY_SENTINEL}, @@ -135,23 +135,39 @@ cudf::size_type distinct_count(table_view const& keys, detail::hash_table_allocator_type{default_allocator{}, stream}, stream.value()}; - compaction_hash hash_key{has_null, *table_ptr}; - row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal); + auto const preprocessed_input = + cudf::experimental::row::hash::preprocessed_table::create(keys, stream); + + auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input); + auto const hash_key = experimental::compaction_hash(row_hasher.device_hasher(has_nulls)); + + auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input); + auto iter = cudf::detail::make_counting_transform_iterator( 0, [] __device__(size_type i) { return cuco::make_pair(i, i); }); - // when nulls are equal, insert non-null rows only to improve efficiency - if (nulls_equal == null_equality::EQUAL and has_null) { - thrust::counting_iterator stencil(0); - auto const [row_bitmask, null_count] = cudf::detail::bitmask_or(keys, stream); - row_validity pred{static_cast(row_bitmask.data())}; - - key_map.insert_if(iter, iter + 
num_rows, stencil, pred, hash_key, row_equal, stream.value()); - return key_map.get_size() + static_cast<std::size_t>((null_count > 0) ? 1 : 0); + auto const comparator_helper = [&](auto const row_equal) { + // when nulls are equal, insert non-null rows only to improve efficiency + if (nulls_equal == null_equality::EQUAL and has_nulls) { + thrust::counting_iterator stencil(0); + auto const [row_bitmask, null_count] = cudf::detail::bitmask_or(keys, stream); + row_validity pred{static_cast<bitmask_type const*>(row_bitmask.data())}; + + key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value()); + return key_map.get_size() + static_cast<std::size_t>(null_count > 0); + } + // otherwise, insert all + key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); + return key_map.get_size(); + }; + + if (cudf::detail::has_nested_columns(keys)) { + auto const row_equal = row_comp.equal_to<true>(has_nulls, nulls_equal); + return comparator_helper(row_equal); + } else { + auto const row_equal = row_comp.equal_to<false>(has_nulls, nulls_equal); + return comparator_helper(row_equal); } - // otherwise, insert all - key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value()); - return key_map.get_size(); } cudf::size_type distinct_count(column_view const& input, diff --git a/cpp/src/stream_compaction/stream_compaction_common.cuh b/cpp/src/stream_compaction/stream_compaction_common.cuh index 0970a99edad..02cef0e6467 100644 --- a/cpp/src/stream_compaction/stream_compaction_common.cuh +++ b/cpp/src/stream_compaction/stream_compaction_common.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -29,24 +29,6 @@ namespace cudf { namespace detail { -/** - * @brief Device callable to hash a given row.
- */ -template -class compaction_hash { - public: - compaction_hash(Nullate has_nulls, table_device_view t) : _hash{has_nulls, t} {} - - __device__ inline auto operator()(size_type i) const noexcept - { - auto hash = _hash(i); - return (hash == COMPACTION_EMPTY_KEY_SENTINEL) ? (hash - 1) : hash; - } - - private: - row_hash _hash; -}; - namespace experimental { /** diff --git a/cpp/src/stream_compaction/stream_compaction_common.hpp b/cpp/src/stream_compaction/stream_compaction_common.hpp index 82dee50ee94..eb57a62fd71 100644 --- a/cpp/src/stream_compaction/stream_compaction_common.hpp +++ b/cpp/src/stream_compaction/stream_compaction_common.hpp @@ -41,7 +41,5 @@ using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor; -using row_hash = cudf::row_hasher; - } // namespace detail } // namespace cudf diff --git a/cpp/src/stream_compaction/unique_count.cu b/cpp/src/stream_compaction/unique_count.cu index 8363ee8120b..c7c10438d7a 100644 --- a/cpp/src/stream_compaction/unique_count.cu +++ b/cpp/src/stream_compaction/unique_count.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022, NVIDIA CORPORATION. + * Copyright (c) 2022-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -26,6 +26,7 @@ #include #include #include +#include #include #include @@ -70,14 +71,24 @@ cudf::size_type unique_count(table_view const& keys, null_equality nulls_equal, rmm::cuda_stream_view stream) { - auto table_ptr = cudf::table_device_view::create(keys, stream); - row_equality_comparator comp( - nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr, *table_ptr, nulls_equal); - return thrust::count_if( - rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(keys.num_rows()), - [comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); }); + auto const row_comp = cudf::experimental::row::equality::self_comparator(keys, stream); + if (cudf::detail::has_nested_columns(keys)) { + auto const comp = + row_comp.equal_to<true>(nullate::DYNAMIC{has_nested_nulls(keys)}, nulls_equal); + return thrust::count_if( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(keys.num_rows()), + [comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); }); + } else { + auto const comp = + row_comp.equal_to<false>(nullate::DYNAMIC{has_nested_nulls(keys)}, nulls_equal); + return thrust::count_if( + rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(keys.num_rows()), + [comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); }); + } } cudf::size_type unique_count(column_view const& input, diff --git a/cpp/tests/stream_compaction/distinct_count_tests.cpp b/cpp/tests/stream_compaction/distinct_count_tests.cpp index 31bbd43c78d..c7b6d36c538 100644 --- a/cpp/tests/stream_compaction/distinct_count_tests.cpp +++ b/cpp/tests/stream_compaction/distinct_count_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License.
@@ -24,12 +24,18 @@ #include #include #include +#include #include #include #include #include +using lists_col = cudf::test::lists_column_wrapper; +using structs_col = cudf::test::structs_column_wrapper; + +using cudf::test::iterators::nulls_at; + using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; @@ -306,3 +312,48 @@ TEST_F(DistinctCount, TableWithStringColumnWithNull) EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL)); EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL)); } + +TEST_F(DistinctCount, NullableLists) +{ + auto const keys = lists_col{ + {{}, {1, 1}, {1}, {} /*NULL*/, {1}, {} /*NULL*/, {2}, {2, 1}, {2}, {2, 2}, {}, {2, 2}}, + nulls_at({3, 5})}; + auto const input = cudf::table_view{{keys}}; + + EXPECT_EQ(7, cudf::distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(8, cudf::distinct_count(input, null_equality::UNEQUAL)); +} + +TEST_F(DistinctCount, NullableStructOfStructs) +{ + // +-----------------+ + // | s1{s2{a,b}, c} | + // +-----------------+ + // 0 | { {1, 1}, 5} | + // 1 | { Null, 4} | + // 2 | { {1, 1}, 5} | // Same as 0 + // 3 | { {1, 2}, 4} | + // 4 | { Null, 6} | + // 5 | { Null, 4} | // Same as 1 + // 6 | Null | + // 7 | { {2, 1}, 5} | + // 8 | Null | // Same as 6 + + auto const keys = [&] { + auto a = cudf::test::fixed_width_column_wrapper{1, XXX, 1, 1, XXX, XXX, 0, 2, 0}; + auto b = cudf::test::fixed_width_column_wrapper{1, XXX, 1, 2, XXX, XXX, 0, 1, 0}; + auto s2 = structs_col{{a, b}, nulls_at({1, 4, 5})}; + + auto c = cudf::test::fixed_width_column_wrapper{5, 4, 5, 4, 6, 4, 0, 5, 0}; + std::vector<std::unique_ptr<cudf::column>> s1_children; + s1_children.emplace_back(s2.release()); + s1_children.emplace_back(c.release()); + auto const null_it = nulls_at({6, 8}); + return structs_col(std::move(s1_children), std::vector{null_it, null_it + 9}); + }(); + + auto const input = cudf::table_view{{keys}}; + + EXPECT_EQ(6, cudf::distinct_count(input, null_equality::EQUAL)); + EXPECT_EQ(8, cudf::distinct_count(input,
null_equality::UNEQUAL)); +} diff --git a/cpp/tests/stream_compaction/unique_count_tests.cpp b/cpp/tests/stream_compaction/unique_count_tests.cpp index 591fe042592..26a9ca26d2a 100644 --- a/cpp/tests/stream_compaction/unique_count_tests.cpp +++ b/cpp/tests/stream_compaction/unique_count_tests.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * Copyright (c) 2019-2023, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -24,12 +24,18 @@ #include #include #include +#include #include #include #include #include +using lists_col = cudf::test::lists_column_wrapper; +using structs_col = cudf::test::structs_column_wrapper; + +using cudf::test::iterators::nulls_at; + using cudf::nan_policy; using cudf::null_equality; using cudf::null_policy; @@ -237,3 +243,48 @@ TEST_F(UniqueCount, EmptyColumn) constexpr auto expected = 0; EXPECT_EQ(expected, cudf::unique_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL)); } + +TEST_F(UniqueCount, NullableLists) +{ + auto const keys = lists_col{ + {{}, {}, {1, 1}, {1}, {1}, {} /*NULL*/, {} /*NULL*/, {2}, {2}, {2, 1}, {2, 2}, {2, 2}}, + nulls_at({5, 6})}; + auto const input = cudf::table_view{{keys}}; + + EXPECT_EQ(7, cudf::unique_count(input, null_equality::EQUAL)); + EXPECT_EQ(8, cudf::unique_count(input, null_equality::UNEQUAL)); +} + +TEST_F(UniqueCount, NullableStructOfStructs) +{ + // +-----------------+ + // | s1{s2{a,b}, c} | + // +-----------------+ + // 0 | { {1, 1}, 5} | + // 1 | { {1, 1}, 5} | // Same as 0 + // 2 | { {1, 2}, 4} | + // 3 | { Null, 6} | + // 4 | { Null, 4} | + // 5 | { Null, 4} | // Same as 4 + // 6 | Null | + // 7 | Null | // Same as 6 + // 8 | { {2, 1}, 5} | + + auto const keys = [&] { + auto a = cudf::test::fixed_width_column_wrapper{1, 1, 1, XXX, XXX, XXX, 2, 1, 2}; + auto b = cudf::test::fixed_width_column_wrapper{1, 1, 2, XXX, XXX, XXX, 2, 1, 1}; + auto s2 = 
structs_col{{a, b}, nulls_at({3, 4, 5})}; + + auto c = cudf::test::fixed_width_column_wrapper{5, 5, 4, 6, 4, 4, 3, 3, 5}; + std::vector<std::unique_ptr<cudf::column>> s1_children; + s1_children.emplace_back(s2.release()); + s1_children.emplace_back(c.release()); + auto const null_it = nulls_at({6, 7}); + return structs_col(std::move(s1_children), std::vector{null_it, null_it + 9}); + }(); + + auto const input = cudf::table_view{{keys}}; + + EXPECT_EQ(6, cudf::unique_count(input, null_equality::EQUAL)); + EXPECT_EQ(8, cudf::unique_count(input, null_equality::UNEQUAL)); +}