diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index 749badc715d..149c6ad7219 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -19,6 +19,7 @@ #include #include +#include #include #include @@ -55,3 +56,43 @@ NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type)) .set_name("distinct") .set_type_axes_names({"Type"}) .add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000}); + +template +void nvbench_distinct_list(nvbench::state& state, nvbench::type_list) +{ + cudf::rmm_pool_raii pool_raii; + + auto const size = state.get_int64("ColumnSize"); + auto const dtype = cudf::type_to_id(); + double const null_frequency = state.get_float64("null_frequency"); + + data_profile table_data_profile; + if (dtype == cudf::type_id::LIST) { + table_data_profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, 4); + table_data_profile.set_distribution_params( + cudf::type_id::INT32, distribution_id::UNIFORM, 0, 4); + table_data_profile.set_list_depth(1); + } else { + // We're comparing distinct() on a non-nested column to that on a list column with the same + // number of distinct rows. The max list size is 4 and the number of distinct values in the + // list's child is 5. So the number of distinct rows in the list = 1 + 5 + 5^2 + 5^3 + 5^4 = 781 + // We want this column to also have 781 distinct values. + table_data_profile.set_distribution_params(dtype, distribution_id::UNIFORM, 0, 781); + } + table_data_profile.set_null_frequency(null_frequency); + + auto const table = create_random_table( + {dtype}, table_size_bytes{static_cast(size)}, table_data_profile, 0); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + auto result = cudf::detail::distinct(*table, {0}, cudf::null_equality::EQUAL, stream_view); + }); +} + +NVBENCH_BENCH_TYPES(nvbench_distinct_list, + NVBENCH_TYPE_AXES(nvbench::type_list)) + .set_name("distinct_list") + .set_type_axes_names({"Type"}) + .add_float64_axis("null_frequency", {0.0, 0.1}) + .add_int64_axis("ColumnSize", {100'000'000}); diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp index e8e100aaec5..9958fa8f3a4 100644 --- a/cpp/include/cudf/detail/hashing.hpp +++ b/cpp/include/cudf/detail/hashing.hpp @@ -33,19 +33,20 @@ namespace detail { std::unique_ptr hash( table_view const& input, hash_id hash_function = hash_id::HASH_MURMUR3, - uint32_t seed = 0, + uint32_t seed = cudf::DEFAULT_HASH_SEED, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr murmur_hash3_32( table_view const& input, + uint32_t seed = cudf::DEFAULT_HASH_SEED, rmm::cuda_stream_view stream = rmm::cuda_stream_default, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); template