Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Update distinct/unique_count to experimental::row hasher/comparator #12776

Merged
Merged
Show file tree
Hide file tree
Changes from 23 commits
Commits
Show all changes
26 commits
Select commit Hold shift + click to select a range
4a8085a
building equality::self_comparator
divyegala Feb 2, 2023
f71d161
two table comp
divyegala Feb 2, 2023
3ca298c
copyright years
divyegala Feb 2, 2023
7c167a7
centralizing repeated logic
divyegala Feb 2, 2023
0ceb79e
address review to create functors
divyegala Feb 3, 2023
37e7326
updating has_nested_columns docs
divyegala Feb 3, 2023
b44f603
Merge remote-tracking branch 'upstream/branch-23.04' into equality-co…
divyegala Feb 3, 2023
c2ff1fc
address review for underscore prefixes in structs
divyegala Feb 7, 2023
c2ca8ee
Merge remote-tracking branch 'upstream/branch-23.04' into equality-co…
divyegala Feb 7, 2023
ffdf10c
Merge remote-tracking branch 'upstream/branch-23.04' into equality-co…
divyegala Feb 8, 2023
53e918f
add rank
divyegala Feb 8, 2023
65e2bce
fix compile times for rank
divyegala Feb 8, 2023
c6bc7f5
Merge remote-tracking branch 'upstream/branch-23.04' into equality-co…
divyegala Feb 8, 2023
1344e33
Apply suggestions from code review
divyegala Feb 11, 2023
4123379
address review
divyegala Feb 11, 2023
26f38b3
Merge remote-tracking branch 'upstream/branch-23.04' into equality-co…
divyegala Feb 11, 2023
9d0f7a6
address review, mark members of functors as private
divyegala Feb 11, 2023
fe41be8
removing partitioning
divyegala Feb 11, 2023
02dd5c5
simplify lists/contains since it already has a nested-type dispatch m…
divyegala Feb 12, 2023
5db4d03
Merge branch 'branch-23.04' into equality-comp-fast-path
divyegala Feb 13, 2023
03d754d
passing tests
divyegala Feb 14, 2023
22d5f90
copyright year
divyegala Feb 14, 2023
76af717
Merge branch 'branch-23.04' into stream_compaction-row-hasher
divyegala Feb 16, 2023
0ac7d8d
Apply suggestions from code review
divyegala Feb 16, 2023
d882693
Merge remote-tracking branch 'upstream/branch-23.04' into stream_comp…
divyegala Mar 7, 2023
e6c41a5
review change
divyegala Mar 7, 2023
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
50 changes: 34 additions & 16 deletions cpp/src/stream_compaction/distinct_count.cu
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
#include <cudf/detail/sorting.hpp>
#include <cudf/detail/stream_compaction.hpp>
#include <cudf/stream_compaction.hpp>
#include <cudf/table/experimental/row_operators.cuh>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

Expand Down Expand Up @@ -125,33 +126,50 @@ cudf::size_type distinct_count(table_view const& keys,
null_equality nulls_equal,
rmm::cuda_stream_view stream)
{
auto table_ptr = cudf::table_device_view::create(keys, stream);
auto const num_rows = table_ptr->num_rows();
auto const has_null = nullate::DYNAMIC{cudf::has_nulls(keys)};
auto const num_rows = keys.num_rows();
auto const has_nulls = nullate::DYNAMIC{cudf::has_nested_nulls(keys)};

hash_map_type key_map{compute_hash_table_size(num_rows),
cuco::empty_key{COMPACTION_EMPTY_KEY_SENTINEL},
cuco::empty_value{COMPACTION_EMPTY_VALUE_SENTINEL},
detail::hash_table_allocator_type{default_allocator<char>{}, stream},
stream.value()};

compaction_hash hash_key{has_null, *table_ptr};
row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal);
// compaction_hash hash_key{has_null, *table_ptr};
// row_equality_comparator row_equal(has_null, *table_ptr, *table_ptr, nulls_equal);
divyegala marked this conversation as resolved.
Show resolved Hide resolved
auto const preprocessed_input =
cudf::experimental::row::hash::preprocessed_table::create(keys, stream);

auto const row_hasher = cudf::experimental::row::hash::row_hasher(preprocessed_input);
auto const hash_key = experimental::compaction_hash(row_hasher.device_hasher(has_nulls));

auto const row_comp = cudf::experimental::row::equality::self_comparator(preprocessed_input);

auto iter = cudf::detail::make_counting_transform_iterator(
0, [] __device__(size_type i) { return cuco::make_pair(i, i); });

// when nulls are equal, insert non-null rows only to improve efficiency
if (nulls_equal == null_equality::EQUAL and has_null) {
thrust::counting_iterator<size_type> stencil(0);
auto const [row_bitmask, null_count] = cudf::detail::bitmask_or(keys, stream);
row_validity pred{static_cast<bitmask_type const*>(row_bitmask.data())};

key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value());
return key_map.get_size() + static_cast<std::size_t>((null_count > 0) ? 1 : 0);
auto const comparator_helper = [&](auto const row_equal) {
vyasr marked this conversation as resolved.
Show resolved Hide resolved
// when nulls are equal, insert non-null rows only to improve efficiency
if (nulls_equal == null_equality::EQUAL and has_nulls) {
thrust::counting_iterator<size_type> stencil(0);
auto const [row_bitmask, null_count] = cudf::detail::bitmask_or(keys, stream);
row_validity pred{static_cast<bitmask_type const*>(row_bitmask.data())};

key_map.insert_if(iter, iter + num_rows, stencil, pred, hash_key, row_equal, stream.value());
return key_map.get_size() + static_cast<std::size_t>((null_count > 0) ? 1 : 0);
divyegala marked this conversation as resolved.
Show resolved Hide resolved
}
// otherwise, insert all
key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value());
return key_map.get_size();
};

if (cudf::detail::has_nested_columns(keys)) {
auto const row_equal = row_comp.equal_to<true>(has_nulls, nulls_equal);
return comparator_helper(row_equal);
} else {
auto const row_equal = row_comp.equal_to<false>(has_nulls, nulls_equal);
return comparator_helper(row_equal);
}
// otherwise, insert all
key_map.insert(iter, iter + num_rows, hash_key, row_equal, stream.value());
return key_map.get_size();
}

cudf::size_type distinct_count(column_view const& input,
Expand Down
20 changes: 1 addition & 19 deletions cpp/src/stream_compaction/stream_compaction_common.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -29,24 +29,6 @@
namespace cudf {
namespace detail {

/**
 * @brief Device-callable functor returning the hash of the row at a given index.
 *
 * The result of the wrapped row hasher is passed through unchanged, except when
 * it equals `COMPACTION_EMPTY_KEY_SENTINEL`; in that case `hash - 1` is returned
 * so that this functor never yields the sentinel value itself.
 */
template <typename Nullate>
class compaction_hash {
 public:
  compaction_hash(Nullate has_nulls, table_device_view t) : _hash{has_nulls, t} {}

  __device__ inline auto operator()(size_type i) const noexcept
  {
    auto const hashed = _hash(i);
    // Step off the sentinel value; every other hash is returned as-is.
    return (hashed != COMPACTION_EMPTY_KEY_SENTINEL) ? hashed : (hashed - 1);
  }

 private:
  row_hash _hash;
};

namespace experimental {

/**
Expand Down
2 changes: 0 additions & 2 deletions cpp/src/stream_compaction/stream_compaction_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -41,7 +41,5 @@ using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor<default_allo
using hash_map_type =
cuco::static_map<size_type, size_type, cuda::thread_scope_device, hash_table_allocator_type>;

using row_hash = cudf::row_hasher<default_hash, cudf::nullate::DYNAMIC>;

} // namespace detail
} // namespace cudf
31 changes: 22 additions & 9 deletions cpp/src/stream_compaction/unique_count.cu
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
* Copyright (c) 2022-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -26,6 +26,7 @@
#include <cudf/detail/sorting.hpp>
#include <cudf/detail/stream_compaction.hpp>
#include <cudf/stream_compaction.hpp>
#include <cudf/table/experimental/row_operators.cuh>
#include <cudf/table/table_view.hpp>
#include <cudf/utilities/type_dispatcher.hpp>

Expand Down Expand Up @@ -70,14 +71,26 @@ cudf::size_type unique_count(table_view const& keys,
null_equality nulls_equal,
rmm::cuda_stream_view stream)
{
auto table_ptr = cudf::table_device_view::create(keys, stream);
row_equality_comparator comp(
nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr, *table_ptr, nulls_equal);
return thrust::count_if(
rmm::exec_policy(stream),
thrust::counting_iterator<cudf::size_type>(0),
thrust::counting_iterator<cudf::size_type>(keys.num_rows()),
[comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); });
auto const row_comp = cudf::experimental::row::equality::self_comparator(keys, stream);
// row_equality_comparator comp(
// nullate::DYNAMIC{cudf::has_nulls(keys)}, *table_ptr, *table_ptr, nulls_equal);
divyegala marked this conversation as resolved.
Show resolved Hide resolved
if (cudf::detail::has_nested_columns(keys)) {
auto const comp =
row_comp.equal_to<true>(nullate::DYNAMIC{has_nested_nulls(keys)}, nulls_equal);
return thrust::count_if(
rmm::exec_policy(stream),
thrust::counting_iterator<cudf::size_type>(0),
thrust::counting_iterator<cudf::size_type>(keys.num_rows()),
[comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); });
Copy link
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I assume that we can't capture the comparator by reference because it's a host object that needs to be copied to device for the lambda?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, that's correct

} else {
auto const comp =
row_comp.equal_to<false>(nullate::DYNAMIC{has_nested_nulls(keys)}, nulls_equal);
return thrust::count_if(
rmm::exec_policy(stream),
thrust::counting_iterator<cudf::size_type>(0),
thrust::counting_iterator<cudf::size_type>(keys.num_rows()),
[comp] __device__(cudf::size_type i) { return (i == 0 or not comp(i, i - 1)); });
vyasr marked this conversation as resolved.
Show resolved Hide resolved
}
}

cudf::size_type unique_count(column_view const& input,
Expand Down
53 changes: 52 additions & 1 deletion cpp/tests/stream_compaction/distinct_count_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -24,12 +24,18 @@
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/iterator_utilities.hpp>
#include <cudf_test/table_utilities.hpp>
#include <cudf_test/type_lists.hpp>

#include <algorithm>
#include <cmath>

using lists_col = cudf::test::lists_column_wrapper<int32_t>;
using structs_col = cudf::test::structs_column_wrapper;

using cudf::test::iterators::nulls_at;

using cudf::nan_policy;
using cudf::null_equality;
using cudf::null_policy;
Expand Down Expand Up @@ -306,3 +312,48 @@ TEST_F(DistinctCount, TableWithStringColumnWithNull)
EXPECT_EQ(9, cudf::distinct_count(input, null_equality::EQUAL));
EXPECT_EQ(10, cudf::distinct_count(input, null_equality::UNEQUAL));
}

// distinct_count over a table holding a single nullable lists column.
TEST_F(DistinctCount, NullableLists)
{
// Rows 3 and 5 are passed to nulls_at; their `{}` payload is only a placeholder (see the
// inline /*NULL*/ markers), unlike rows 0 and 10 which are non-null empty lists.
auto const keys = lists_col{
{{}, {1, 1}, {1}, {} /*NULL*/, {1}, {} /*NULL*/, {2}, {2, 1}, {2}, {2, 2}, {}, {2, 2}},
nulls_at({3, 5})};
auto const input = cudf::table_view{{keys}};

// Six different non-null lists: {}, {1, 1}, {1}, {2}, {2, 1}, {2, 2}.
// With EQUAL the two null rows add one more distinct entry (6 + 1 = 7);
// with UNEQUAL each null row is counted separately (6 + 2 = 8).
EXPECT_EQ(7, cudf::distinct_count(input, null_equality::EQUAL));
EXPECT_EQ(8, cudf::distinct_count(input, null_equality::UNEQUAL));
}

// distinct_count over a table holding a single struct-of-struct column with nulls at both
// nesting levels.
TEST_F(DistinctCount, NullableStructOfStructs)
{
// +-----------------+
// | s1{s2{a,b}, c} |
// +-----------------+
// 0 | { {1, 1}, 5} |
// 1 | { Null, 4} |
// 2 | { {1, 1}, 5} | // Same as 0
// 3 | { {1, 2}, 4} |
// 4 | { Null, 6} |
// 5 | { Null, 4} | // Same as 1
// 6 | Null |
// 7 | { {2, 1}, 5} |
// 8 | Null | // Same as 6

auto const keys = [&] {
// XXX occupies the child slots of rows 1, 4 and 5, which are exactly the rows nulled in s2.
// NOTE(review): XXX is defined outside this view; presumably an arbitrary placeholder value.
auto a = cudf::test::fixed_width_column_wrapper<int32_t>{1, XXX, 1, 1, XXX, XXX, 0, 2, 0};
auto b = cudf::test::fixed_width_column_wrapper<int32_t>{1, XXX, 1, 2, XXX, XXX, 0, 1, 0};
auto s2 = structs_col{{a, b}, nulls_at({1, 4, 5})};

auto c = cudf::test::fixed_width_column_wrapper<int32_t>{5, 4, 5, 4, 6, 4, 0, 5, 0};
std::vector<std::unique_ptr<cudf::column>> s1_children;
s1_children.emplace_back(s2.release());
s1_children.emplace_back(c.release());
// Rows 6 and 8 are null at the top level (s1); the iterator range is materialized into one
// bool per row for the 9 rows of the column.
auto const null_it = nulls_at({6, 8});
return structs_col(std::move(s1_children), std::vector<bool>{null_it, null_it + 9});
}();

auto const input = cudf::table_view{{keys}};

// EQUAL: rows 2, 5 and 8 duplicate rows 0, 1 and 6, leaving 6 distinct rows.
// UNEQUAL: the five rows containing a null (1, 4, 5, 6, 8) are each counted separately;
// only row 2 still duplicates row 0, leaving 8.
EXPECT_EQ(6, cudf::distinct_count(input, null_equality::EQUAL));
EXPECT_EQ(8, cudf::distinct_count(input, null_equality::UNEQUAL));
}
53 changes: 52 additions & 1 deletion cpp/tests/stream_compaction/unique_count_tests.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2022, NVIDIA CORPORATION.
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -24,12 +24,18 @@
#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/iterator_utilities.hpp>
#include <cudf_test/table_utilities.hpp>
#include <cudf_test/type_lists.hpp>

#include <algorithm>
#include <cmath>

using lists_col = cudf::test::lists_column_wrapper<int32_t>;
using structs_col = cudf::test::structs_column_wrapper;

using cudf::test::iterators::nulls_at;

using cudf::nan_policy;
using cudf::null_equality;
using cudf::null_policy;
Expand Down Expand Up @@ -237,3 +243,48 @@ TEST_F(UniqueCount, EmptyColumn)
constexpr auto expected = 0;
EXPECT_EQ(expected, cudf::unique_count(input_col, null_policy::EXCLUDE, nan_policy::NAN_IS_NULL));
}

// unique_count over a table holding a single nullable lists column; equal rows are laid out
// adjacently in the fixture.
TEST_F(UniqueCount, NullableLists)
{
// Rows 5 and 6 are passed to nulls_at; their `{}` payload is only a placeholder (see the
// inline /*NULL*/ markers), unlike rows 0 and 1 which are non-null empty lists.
auto const keys = lists_col{
{{}, {}, {1, 1}, {1}, {1}, {} /*NULL*/, {} /*NULL*/, {2}, {2}, {2, 1}, {2, 2}, {2, 2}},
nulls_at({5, 6})};
auto const input = cudf::table_view{{keys}};

// Runs of consecutive equal rows: {}, {1, 1}, {1}, NULL, {2}, {2, 1}, {2, 2}.
// With EQUAL the two adjacent nulls form one run (7 runs); with UNEQUAL they are two (8).
EXPECT_EQ(7, cudf::unique_count(input, null_equality::EQUAL));
EXPECT_EQ(8, cudf::unique_count(input, null_equality::UNEQUAL));
}

// unique_count over a table holding a single struct-of-struct column with nulls at both
// nesting levels; equal rows are laid out adjacently in the fixture.
TEST_F(UniqueCount, NullableStructOfStructs)
{
// +-----------------+
// | s1{s2{a,b}, c} |
// +-----------------+
// 0 | { {1, 1}, 5} |
// 1 | { {1, 1}, 5} | // Same as 0
// 2 | { {1, 2}, 4} |
// 3 | { Null, 6} |
// 4 | { Null, 4} |
// 5 | { Null, 4} | // Same as 4
// 6 | Null |
// 7 | Null | // Same as 6
// 8 | { {2, 1}, 5} |

auto const keys = [&] {
// XXX occupies the child slots of rows 3, 4 and 5, which are exactly the rows nulled in s2.
// NOTE(review): XXX is defined outside this view; presumably an arbitrary placeholder value.
auto a = cudf::test::fixed_width_column_wrapper<int32_t>{1, 1, 1, XXX, XXX, XXX, 2, 1, 2};
auto b = cudf::test::fixed_width_column_wrapper<int32_t>{1, 1, 2, XXX, XXX, XXX, 2, 1, 1};
auto s2 = structs_col{{a, b}, nulls_at({3, 4, 5})};

auto c = cudf::test::fixed_width_column_wrapper<int32_t>{5, 5, 4, 6, 4, 4, 3, 3, 5};
std::vector<std::unique_ptr<cudf::column>> s1_children;
s1_children.emplace_back(s2.release());
s1_children.emplace_back(c.release());
// Rows 6 and 7 are null at the top level (s1); the iterator range is materialized into one
// bool per row for the 9 rows of the column.
auto const null_it = nulls_at({6, 7});
return structs_col(std::move(s1_children), std::vector<bool>{null_it, null_it + 9});
}();

auto const input = cudf::table_view{{keys}};

// EQUAL: rows 1, 5 and 7 repeat their predecessor, leaving 6 runs.
// UNEQUAL: rows containing a null never match their neighbour, so only row 1 is a repeat (8).
EXPECT_EQ(6, cudf::unique_count(input, null_equality::EQUAL));
EXPECT_EQ(8, cudf::unique_count(input, null_equality::UNEQUAL));
}