From 013e6bf01b4cd044857ac0e8f69ad481261ece90 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 27 Jun 2024 16:04:11 -0700 Subject: [PATCH 1/7] Add throughput metrics, such as bytes_per_second for google benchmarks or GlobalMem BW for nvbench, for reduction benchmarks --- cpp/benchmarks/CMakeLists.txt | 6 +- cpp/benchmarks/common/benchmark_utilities.cpp | 25 ++++++++ cpp/benchmarks/common/benchmark_utilities.hpp | 41 +++++++++++++ cpp/benchmarks/common/nvbench_utilities.cpp | 61 +++++++++++++++++++ cpp/benchmarks/common/nvbench_utilities.hpp | 31 ++++++++++ cpp/benchmarks/common/table_utilities.cpp | 41 +++++++++++++ cpp/benchmarks/common/table_utilities.hpp | 41 +++++++++++++ cpp/benchmarks/reduction/anyall.cpp | 8 ++- cpp/benchmarks/reduction/dictionary.cpp | 8 +++ cpp/benchmarks/reduction/minmax.cpp | 13 +++- cpp/benchmarks/reduction/rank.cpp | 16 ++++- cpp/benchmarks/reduction/reduce.cpp | 8 ++- cpp/benchmarks/reduction/scan.cpp | 19 ++++-- cpp/benchmarks/reduction/scan_structs.cpp | 25 ++++++-- 14 files changed, 328 insertions(+), 15 deletions(-) create mode 100644 cpp/benchmarks/common/benchmark_utilities.cpp create mode 100644 cpp/benchmarks/common/benchmark_utilities.hpp create mode 100644 cpp/benchmarks/common/nvbench_utilities.cpp create mode 100644 cpp/benchmarks/common/nvbench_utilities.hpp create mode 100644 cpp/benchmarks/common/table_utilities.cpp create mode 100644 cpp/benchmarks/common/table_utilities.hpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 8a48126e195..09cdeb943d2 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -41,7 +41,11 @@ target_include_directories( # Use an OBJECT library so we only compile these helper source files only once add_library( cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" - synchronization/synchronization.cpp io/cuio_common.cpp + synchronization/synchronization.cpp + io/cuio_common.cpp + common/table_utilities.cpp 
+ common/benchmark_utilities.cpp + common/nvbench_utilities.cpp ) target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $) add_custom_command( diff --git a/cpp/benchmarks/common/benchmark_utilities.cpp b/cpp/benchmarks/common/benchmark_utilities.cpp new file mode 100644 index 00000000000..29336c3b0ef --- /dev/null +++ b/cpp/benchmarks/common/benchmark_utilities.cpp @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "benchmark_utilities.hpp" + +void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration) { + state.SetItemsProcessed(state.iterations() * items_processed_per_iteration); +} + +void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration) { + state.SetBytesProcessed(state.iterations() * bytes_processed_per_iteration); +} diff --git a/cpp/benchmarks/common/benchmark_utilities.hpp b/cpp/benchmarks/common/benchmark_utilities.hpp new file mode 100644 index 00000000000..4f0b3794022 --- /dev/null +++ b/cpp/benchmarks/common/benchmark_utilities.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +/** + * @brief Sets the number of items processed during the benchmark. + * + * This function could be used instead of ::benchmark::State.SetItemsProcessed() + * to avoid repeatedly computing ::benchmark::State.iterations() * items_processed_per_iteration. + * + * @param state the benchmark state + * @param items_processed_per_iteration number of items processed per iteration + */ +void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration); + +/** + * @brief Sets the number of bytes processed during the benchmark. + * + * This function could be used instead of ::benchmark::State.SetBytesProcessed() + * to avoid repeatedly computing ::benchmark::State.iterations() * bytes_processed_per_iteration. + * + * @param state the benchmark state + * @param bytes_processed_per_iteration number of bytes processed per iteration + */ +void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration); diff --git a/cpp/benchmarks/common/nvbench_utilities.cpp b/cpp/benchmarks/common/nvbench_utilities.cpp new file mode 100644 index 00000000000..ca2f7f5d698 --- /dev/null +++ b/cpp/benchmarks/common/nvbench_utilities.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nvbench_utilities.hpp" + +#include + +// This function is copied over from https://github.com/NVIDIA/nvbench/blob/a171514056e5d6a7f52a035dd6c812fa301d4f4f/nvbench/detail/measure_cold.cu#L190-L224. +void set_throughputs(nvbench::state& state) +{ + double avg_cuda_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + + if (const auto items = state.get_element_count(); items != 0) + { + auto &summ = state.add_summary("nv/cold/bw/item_rate"); + summ.set_string("name", "Elem/s"); + summ.set_string("hint", "item_rate"); + summ.set_string("description", "Number of input elements processed per second"); + summ.set_float64("value", static_cast(items) / avg_cuda_time); + } + + if (const auto bytes = state.get_global_memory_rw_bytes(); bytes != 0) + { + const auto avg_used_gmem_bw = static_cast(bytes) / avg_cuda_time; + { + auto &summ = state.add_summary("nv/cold/bw/global/bytes_per_second"); + summ.set_string("name", "GlobalMem BW"); + summ.set_string("hint", "byte_rate"); + summ.set_string("description", + "Number of bytes read/written per second to the CUDA " + "device's global memory"); + summ.set_float64("value", avg_used_gmem_bw); + } + + { + const auto peak_gmem_bw = + static_cast(state.get_device()->get_global_memory_bus_bandwidth()); + + auto &summ = state.add_summary("nv/cold/bw/global/utilization"); + summ.set_string("name", "BWUtil"); + summ.set_string("hint", "percentage"); + summ.set_string("description", + "Global device memory utilization as a percentage of the " + "device's peak bandwidth"); + 
summ.set_float64("value", avg_used_gmem_bw / peak_gmem_bw); + } + } // bandwidth +} diff --git a/cpp/benchmarks/common/nvbench_utilities.hpp b/cpp/benchmarks/common/nvbench_utilities.hpp new file mode 100644 index 00000000000..19997b6dd4d --- /dev/null +++ b/cpp/benchmarks/common/nvbench_utilities.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace nvbench { +struct state; +} + +/** + * @brief Sets throughput statistics, such as "Elem/s", "GlobalMem BW", and "BWUtil" for the + * nvbench results summary. + * + * This function could be used to work around a known issue that the throughput statistics + * should be added before the nvbench::state.exec() call, otherwise they will not be printed + * in the summary. See https://github.com/NVIDIA/nvbench/issues/175 for more details. + */ +void set_throughputs(nvbench::state& state); diff --git a/cpp/benchmarks/common/table_utilities.cpp b/cpp/benchmarks/common/table_utilities.cpp new file mode 100644 index 00000000000..c35af1e0b76 --- /dev/null +++ b/cpp/benchmarks/common/table_utilities.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "table_utilities.hpp" + +#include +#include + +int64_t estimate_size(std::unique_ptr column) +{ + std::vector> columns; + columns.emplace_back(std::move(column)); + cudf::table table{std::move(columns)}; + return estimate_size(table.view()); +} + +int64_t estimate_size(cudf::table_view const& view) +{ + // Compute the size in bits for each row. + auto const row_sizes = cudf::row_bit_count(view); + // Accumulate the row sizes to compute a sum. + auto const agg = cudf::make_sum_aggregation(); + cudf::data_type sum_dtype{cudf::type_id::INT64}; + auto const total_size_scalar = cudf::reduce(*row_sizes, *agg, sum_dtype); + auto const total_size_in_bits = static_cast*>(total_size_scalar.get())->value(); + // Convert the size in bits to the size in bytes. + return static_cast(static_cast(total_size_in_bits) / 8); +} diff --git a/cpp/benchmarks/common/table_utilities.hpp b/cpp/benchmarks/common/table_utilities.hpp new file mode 100644 index 00000000000..f2c9535f411 --- /dev/null +++ b/cpp/benchmarks/common/table_utilities.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +/** + * @brief Estimates the column size in bytes. + * + * @remark As this function internally uses cudf::row_bit_count() to estimate each row size + * and accumulates them, the returned estimate may be an inexact approximation in some + * cases. See cudf::row_bit_count() for more details. + * + * @param column The column to estimate its size + */ +int64_t estimate_size(std::unique_ptr column); + +/** + * @brief Estimates the table size in bytes. + * + * @remark As this function internally uses cudf::row_bit_count() to estimate each row size + * and accumulates them, the returned estimate may be an inexact approximation in some + * cases. See cudf::row_bit_count() for more details. + * + * @param view The view to estimate its size + */ +int64_t estimate_size(cudf::table_view const& view); diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp index 8b1e71c1585..13ddbf3605e 100644 --- a/cpp/benchmarks/reduction/anyall.cpp +++ b/cpp/benchmarks/reduction/anyall.cpp @@ -14,7 +14,9 @@ * limitations under the License. */ +#include #include +#include #include #include @@ -34,7 +36,7 @@ void BM_reduction_anyall(benchmark::State& state, auto const dtype = cudf::type_to_id(); data_profile const profile = data_profile_builder().no_validity().distribution( dtype, distribution_id::UNIFORM, 0, agg->kind == cudf::aggregation::ANY ? 
0 : 100); - auto const values = create_random_column(dtype, row_count{column_size}, profile); + auto values = create_random_column(dtype, row_count{column_size}, profile); cudf::data_type output_dtype{cudf::type_id::BOOL8}; @@ -42,6 +44,10 @@ void BM_reduction_anyall(benchmark::State& state, cuda_event_timer timer(state, true); auto result = cudf::reduce(*values, *agg, output_dtype); } + + // The benchmark takes a column and produces one scalar. + set_items_processed(state, column_size + 1); + set_bytes_processed(state, estimate_size(std::move(values)) + cudf::size_of(output_dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp index c1c44c919ac..f715277aae7 100644 --- a/cpp/benchmarks/reduction/dictionary.cpp +++ b/cpp/benchmarks/reduction/dictionary.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -52,6 +53,13 @@ void BM_reduction_dictionary(benchmark::State& state, cuda_event_timer timer(state, true); auto result = cudf::reduce(*values, *agg, output_dtype); } + + // The benchmark takes a column and produces one scalar. + set_items_processed(state, column_size + 1); + + // We don't set the metrics for the size read/written as row_bit_count() doesn't + // support the dictionary type yet (and neither does estimate_size()). + // See https://github.com/rapidsai/cudf/issues/16121 for details. } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp index 963c26692e7..07efb016877 100644 --- a/cpp/benchmarks/reduction/minmax.cpp +++ b/cpp/benchmarks/reduction/minmax.cpp @@ -14,7 +14,9 @@ * limitations under the License.
*/ +#include #include +#include #include #include @@ -28,14 +30,19 @@ template void BM_reduction(benchmark::State& state) { cudf::size_type const column_size{(cudf::size_type)state.range(0)}; - auto const dtype = cudf::type_to_id(); - auto const input_column = - create_random_column(dtype, row_count{column_size}, data_profile_builder().no_validity()); + auto const dtype_id = cudf::type_to_id(); + auto input_column = + create_random_column(dtype_id, row_count{column_size}, data_profile_builder().no_validity()); for (auto _ : state) { cuda_event_timer timer(state, true); auto result = cudf::minmax(*input_column); } + + // The benchmark takes a column and produces two scalars. + set_items_processed(state, column_size + 2); + cudf::data_type dtype = cudf::data_type{dtype_id}; + set_bytes_processed(state, estimate_size(std::move(input_column)) + 2 * cudf::size_of(dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index e55f3b9e09f..4878bacc3f4 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -15,6 +15,8 @@ */ #include +#include +#include #include #include @@ -39,11 +41,23 @@ static void nvbench_reduction_scan(nvbench::state& state, nvbench::type_listview(), 2); cudf::column_view input(new_tbl->view().column(0)); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + int64_t result_size = 0; + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { rmm::cuda_stream_view stream_view{launch.get_stream()}; + timer.start(); auto result = cudf::detail::inclusive_dense_rank_scan( input, stream_view, rmm::mr::get_current_device_resource()); + timer.stop(); + + // Estimating the result size will launch a kernel. Do not include it in measuring time. 
+ result_size += estimate_size(std::move(result)); }); + + state.add_element_count(new_tbl->num_rows()); + state.add_global_memory_reads(estimate_size(new_tbl->view())); + state.add_global_memory_writes(result_size); + + set_throughputs(state); } using data_type = nvbench::type_list; diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp index 5bd3e2e3bba..0d8f8285428 100644 --- a/cpp/benchmarks/reduction/reduce.cpp +++ b/cpp/benchmarks/reduction/reduce.cpp @@ -14,7 +14,9 @@ * limitations under the License. */ +#include #include +#include #include #include @@ -34,7 +36,7 @@ void BM_reduction(benchmark::State& state, std::unique_ptr(); data_profile const profile = data_profile_builder().no_validity().distribution(dtype, distribution_id::UNIFORM, 0, 100); - auto const input_column = create_random_column(dtype, row_count{column_size}, profile); + auto input_column = create_random_column(dtype, row_count{column_size}, profile); cudf::data_type output_dtype = (agg->kind == cudf::aggregation::MEAN || agg->kind == cudf::aggregation::VARIANCE || @@ -46,6 +48,10 @@ void BM_reduction(benchmark::State& state, std::unique_ptr #include +#include #include #include @@ -31,14 +33,23 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const dtype = cudf::type_to_id(); - auto const column = create_random_column(dtype, row_count{n_rows}); + auto column = create_random_column(dtype, row_count{n_rows}); if (!include_nulls) column->set_null_mask(rmm::device_buffer{}, 0); + int64_t result_size = 0; for (auto _ : state) { - cuda_event_timer timer(state, true); - auto result = cudf::scan( - *column, *cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); + std::unique_ptr result = nullptr; + { + cuda_event_timer timer(state, true); + result = cudf::scan( + *column, *cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); + } + result_size = 
estimate_size(std::move(result)); } + + // The benchmark takes a column and produces a new column of the same size as input. + set_items_processed(state, n_rows * 2); + set_bytes_processed(state, estimate_size(std::move(column)) + result_size); } #define SCAN_BENCHMARK_DEFINE(name, type, nulls) \ diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp index ee97b54fbef..cee25e33ef0 100644 --- a/cpp/benchmarks/reduction/scan_structs.cpp +++ b/cpp/benchmarks/reduction/scan_structs.cpp @@ -15,6 +15,8 @@ */ #include +#include +#include #include #include @@ -43,18 +45,33 @@ static void nvbench_structs_scan(nvbench::state& state) row_count{size}, profile); auto [null_mask, null_count] = create_random_null_mask(size, null_probability); - auto const input = cudf::make_structs_column( + auto input = cudf::make_structs_column( size, std::move(data_table->release()), null_count, std::move(null_mask)); + std::vector> columns; + columns.emplace_back(std::move(input)); + cudf::table input_table{std::move(columns)}; auto const agg = cudf::make_min_aggregation(); auto const null_policy = static_cast(state.get_int64("null_policy")); auto const stream = cudf::get_default_stream(); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto const result = cudf::detail::scan_inclusive( - *input, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); + int64_t result_size = 0; + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + timer.start(); + auto result = cudf::detail::scan_inclusive( + input_table.view().column(0), *agg, null_policy, stream, rmm::mr::get_current_device_resource()); + timer.stop(); + + // Estimating the result size will launch a kernel. Do not include it in measuring time. 
+ result_size += estimate_size(std::move(result)); }); + + state.add_element_count(input_table.num_rows()); + state.add_global_memory_reads(estimate_size(input_table.view())); + state.add_global_memory_writes(result_size); + + set_throughputs(state); } NVBENCH_BENCH(nvbench_structs_scan) From 2daeff561b3ab6b7317fc9abe8529f9c1f4ce497 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 28 Jun 2024 11:23:12 -0700 Subject: [PATCH 2/7] fix copyrights --- cpp/benchmarks/common/benchmark_utilities.cpp | 2 +- cpp/benchmarks/common/benchmark_utilities.hpp | 2 +- cpp/benchmarks/common/nvbench_utilities.cpp | 2 +- cpp/benchmarks/common/nvbench_utilities.hpp | 2 +- cpp/benchmarks/common/table_utilities.cpp | 2 +- cpp/benchmarks/common/table_utilities.hpp | 2 +- cpp/benchmarks/reduction/anyall.cpp | 2 +- cpp/benchmarks/reduction/dictionary.cpp | 2 +- cpp/benchmarks/reduction/minmax.cpp | 2 +- cpp/benchmarks/reduction/rank.cpp | 2 +- cpp/benchmarks/reduction/reduce.cpp | 2 +- cpp/benchmarks/reduction/scan.cpp | 2 +- cpp/benchmarks/reduction/scan_structs.cpp | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/benchmarks/common/benchmark_utilities.cpp b/cpp/benchmarks/common/benchmark_utilities.cpp index 29336c3b0ef..38753e520f9 100644 --- a/cpp/benchmarks/common/benchmark_utilities.cpp +++ b/cpp/benchmarks/common/benchmark_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/common/benchmark_utilities.hpp b/cpp/benchmarks/common/benchmark_utilities.hpp index 4f0b3794022..4c3f0d04b0e 100644 --- a/cpp/benchmarks/common/benchmark_utilities.hpp +++ b/cpp/benchmarks/common/benchmark_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/common/nvbench_utilities.cpp b/cpp/benchmarks/common/nvbench_utilities.cpp index ca2f7f5d698..f1541e85374 100644 --- a/cpp/benchmarks/common/nvbench_utilities.cpp +++ b/cpp/benchmarks/common/nvbench_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/common/nvbench_utilities.hpp b/cpp/benchmarks/common/nvbench_utilities.hpp index 19997b6dd4d..270b0bed9ef 100644 --- a/cpp/benchmarks/common/nvbench_utilities.hpp +++ b/cpp/benchmarks/common/nvbench_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/common/table_utilities.cpp b/cpp/benchmarks/common/table_utilities.cpp index c35af1e0b76..f0e31655aa0 100644 --- a/cpp/benchmarks/common/table_utilities.cpp +++ b/cpp/benchmarks/common/table_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/common/table_utilities.hpp b/cpp/benchmarks/common/table_utilities.hpp index f2c9535f411..2e47828e8cd 100644 --- a/cpp/benchmarks/common/table_utilities.hpp +++ b/cpp/benchmarks/common/table_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp index 13ddbf3605e..56311771c9c 100644 --- a/cpp/benchmarks/reduction/anyall.cpp +++ b/cpp/benchmarks/reduction/anyall.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp index f715277aae7..01591739695 100644 --- a/cpp/benchmarks/reduction/dictionary.cpp +++ b/cpp/benchmarks/reduction/dictionary.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp index 07efb016877..39583b0afba 100644 --- a/cpp/benchmarks/reduction/minmax.cpp +++ b/cpp/benchmarks/reduction/minmax.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index 4878bacc3f4..fc35d43272a 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp index 0d8f8285428..a2d00df5371 100644 --- a/cpp/benchmarks/reduction/reduce.cpp +++ b/cpp/benchmarks/reduction/reduce.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/reduction/scan.cpp b/cpp/benchmarks/reduction/scan.cpp index 6d6f93ad90b..a4a89c27727 100644 --- a/cpp/benchmarks/reduction/scan.cpp +++ b/cpp/benchmarks/reduction/scan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp index cee25e33ef0..55adfac2afc 100644 --- a/cpp/benchmarks/reduction/scan_structs.cpp +++ b/cpp/benchmarks/reduction/scan_structs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 40804e27cd510f746e92105a23e7feb9a35d6cbe Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 28 Jun 2024 11:57:34 -0700 Subject: [PATCH 3/7] estimate_size() should take a column_view --- cpp/benchmarks/common/table_utilities.cpp | 7 ++----- cpp/benchmarks/common/table_utilities.hpp | 6 +++--- cpp/benchmarks/reduction/anyall.cpp | 4 ++-- cpp/benchmarks/reduction/minmax.cpp | 4 ++-- cpp/benchmarks/reduction/rank.cpp | 6 +++--- cpp/benchmarks/reduction/reduce.cpp | 4 ++-- cpp/benchmarks/reduction/scan.cpp | 6 +++--- cpp/benchmarks/reduction/scan_structs.cpp | 16 +++++++--------- 8 files changed, 24 insertions(+), 29 deletions(-) diff --git a/cpp/benchmarks/common/table_utilities.cpp b/cpp/benchmarks/common/table_utilities.cpp index f0e31655aa0..0ac90e3dbf8 100644 --- a/cpp/benchmarks/common/table_utilities.cpp +++ b/cpp/benchmarks/common/table_utilities.cpp @@ -19,12 +19,9 @@ #include #include -int64_t estimate_size(std::unique_ptr column) +int64_t estimate_size(cudf::column_view const& col) { - std::vector> columns; - columns.emplace_back(std::move(column)); - cudf::table table{std::move(columns)}; - return estimate_size(table.view()); + return estimate_size( cudf::table_view( {col} ) ); } int64_t estimate_size(cudf::table_view const& view) diff --git a/cpp/benchmarks/common/table_utilities.hpp b/cpp/benchmarks/common/table_utilities.hpp index 2e47828e8cd..ad7ac5dc451 100644 --- a/cpp/benchmarks/common/table_utilities.hpp +++ b/cpp/benchmarks/common/table_utilities.hpp @@ -25,9 +25,9 @@ * and accumulates them, the returned estimate may be an inexact approximation in some * cases. See cudf::row_bit_count() for more details. * - * @param column The column to estimate its size + * @param view The column view to estimate its size */ -int64_t estimate_size(std::unique_ptr column); +int64_t estimate_size(cudf::column_view const& view); /** * @brief Estimates the table size in bytes. 
@@ -36,6 +36,6 @@ int64_t estimate_size(std::unique_ptr column); * and accumulates them, the returned estimate may be an inexact approximation in some * cases. See cudf::row_bit_count() for more details. * - * @param view The view to estimate its size + * @param view The table view to estimate its size */ int64_t estimate_size(cudf::table_view const& view); diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp index 56311771c9c..e9d23881764 100644 --- a/cpp/benchmarks/reduction/anyall.cpp +++ b/cpp/benchmarks/reduction/anyall.cpp @@ -36,7 +36,7 @@ void BM_reduction_anyall(benchmark::State& state, auto const dtype = cudf::type_to_id(); data_profile const profile = data_profile_builder().no_validity().distribution( dtype, distribution_id::UNIFORM, 0, agg->kind == cudf::aggregation::ANY ? 0 : 100); - auto values = create_random_column(dtype, row_count{column_size}, profile); + auto const values = create_random_column(dtype, row_count{column_size}, profile); cudf::data_type output_dtype{cudf::type_id::BOOL8}; @@ -47,7 +47,7 @@ void BM_reduction_anyall(benchmark::State& state, // The benchmark takes a column and produces one scalar. 
set_items_processed(state, column_size + 1); - set_bytes_processed(state, estimate_size(std::move(values)) + cudf::size_of(output_dtype)); + set_bytes_processed(state, estimate_size(values->view()) + cudf::size_of(output_dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp index 39583b0afba..050f2887221 100644 --- a/cpp/benchmarks/reduction/minmax.cpp +++ b/cpp/benchmarks/reduction/minmax.cpp @@ -31,7 +31,7 @@ void BM_reduction(benchmark::State& state) { cudf::size_type const column_size{(cudf::size_type)state.range(0)}; auto const dtype_id = cudf::type_to_id(); - auto input_column = + auto const input_column = create_random_column(dtype_id, row_count{column_size}, data_profile_builder().no_validity()); for (auto _ : state) { @@ -42,7 +42,7 @@ void BM_reduction(benchmark::State& state) // The benchmark takes a column and produces two scalars. set_items_processed(state, column_size + 2); cudf::data_type dtype = cudf::data_type{dtype_id}; - set_bytes_processed(state, estimate_size(std::move(input_column)) + 2 * cudf::size_of(dtype)); + set_bytes_processed(state, estimate_size(input_column->view()) + 2 * cudf::size_of(dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index fc35d43272a..ee7f3ad0e66 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -50,11 +50,11 @@ static void nvbench_reduction_scan(nvbench::state& state, nvbench::type_listview()); }); - state.add_element_count(new_tbl->num_rows()); - state.add_global_memory_reads(estimate_size(new_tbl->view())); + state.add_element_count(input.size()); + state.add_global_memory_reads(estimate_size(input)); state.add_global_memory_writes(result_size); set_throughputs(state); diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp index a2d00df5371..63c96f4fe9e 100644 --- 
a/cpp/benchmarks/reduction/reduce.cpp +++ b/cpp/benchmarks/reduction/reduce.cpp @@ -36,7 +36,7 @@ void BM_reduction(benchmark::State& state, std::unique_ptr(); data_profile const profile = data_profile_builder().no_validity().distribution(dtype, distribution_id::UNIFORM, 0, 100); - auto input_column = create_random_column(dtype, row_count{column_size}, profile); + auto const input_column = create_random_column(dtype, row_count{column_size}, profile); cudf::data_type output_dtype = (agg->kind == cudf::aggregation::MEAN || agg->kind == cudf::aggregation::VARIANCE || @@ -51,7 +51,7 @@ void BM_reduction(benchmark::State& state, std::unique_ptrview()) + cudf::size_of(output_dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/scan.cpp b/cpp/benchmarks/reduction/scan.cpp index a4a89c27727..d7d47a8c9f1 100644 --- a/cpp/benchmarks/reduction/scan.cpp +++ b/cpp/benchmarks/reduction/scan.cpp @@ -33,7 +33,7 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const dtype = cudf::type_to_id(); - auto column = create_random_column(dtype, row_count{n_rows}); + auto const column = create_random_column(dtype, row_count{n_rows}); if (!include_nulls) column->set_null_mask(rmm::device_buffer{}, 0); int64_t result_size = 0; @@ -44,12 +44,12 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls) result = cudf::scan( *column, *cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); } - result_size = estimate_size(std::move(result)); + result_size = estimate_size(result->view()); } // The benchmark takes a column and produces a new column of the same size as input. 
set_items_processed(state, n_rows * 2); - set_bytes_processed(state, estimate_size(std::move(column)) + result_size); + set_bytes_processed(state, estimate_size(column->view()) + result_size); } #define SCAN_BENCHMARK_DEFINE(name, type, nulls) \ diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp index 55adfac2afc..410843ab0f2 100644 --- a/cpp/benchmarks/reduction/scan_structs.cpp +++ b/cpp/benchmarks/reduction/scan_structs.cpp @@ -45,11 +45,9 @@ static void nvbench_structs_scan(nvbench::state& state) row_count{size}, profile); auto [null_mask, null_count] = create_random_null_mask(size, null_probability); - auto input = cudf::make_structs_column( + auto const input = cudf::make_structs_column( size, std::move(data_table->release()), null_count, std::move(null_mask)); - std::vector> columns; - columns.emplace_back(std::move(input)); - cudf::table input_table{std::move(columns)}; + auto input_view = input->view(); auto const agg = cudf::make_min_aggregation(); auto const null_policy = static_cast(state.get_int64("null_policy")); @@ -59,16 +57,16 @@ static void nvbench_structs_scan(nvbench::state& state) int64_t result_size = 0; state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { timer.start(); - auto result = cudf::detail::scan_inclusive( - input_table.view().column(0), *agg, null_policy, stream, rmm::mr::get_current_device_resource()); + auto const result = cudf::detail::scan_inclusive( + input_view, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); timer.stop(); // Estimating the result size will launch a kernel. Do not include it in measuring time. 
- result_size += estimate_size(std::move(result)); + result_size += estimate_size(result->view()); }); - state.add_element_count(input_table.num_rows()); - state.add_global_memory_reads(estimate_size(input_table.view())); + state.add_element_count(input_view.size()); + state.add_global_memory_reads(estimate_size(input_view)); state.add_global_memory_writes(result_size); set_throughputs(state); From 951795e4e376f5a10ceb83f095294f0b8c605f18 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 28 Jun 2024 12:02:46 -0700 Subject: [PATCH 4/7] add missing roundup --- cpp/benchmarks/common/table_utilities.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/common/table_utilities.cpp b/cpp/benchmarks/common/table_utilities.cpp index 0ac90e3dbf8..20c7e7ffd74 100644 --- a/cpp/benchmarks/common/table_utilities.cpp +++ b/cpp/benchmarks/common/table_utilities.cpp @@ -19,6 +19,8 @@ #include #include +#include + int64_t estimate_size(cudf::column_view const& col) { return estimate_size( cudf::table_view( {col} ) ); @@ -34,5 +36,5 @@ int64_t estimate_size(cudf::table_view const& view) auto const total_size_scalar = cudf::reduce(*row_sizes, *agg, sum_dtype); auto const total_size_in_bits = static_cast*>(total_size_scalar.get())->value(); // Convert the size in bits to the size in bytes. 
- return static_cast(static_cast(total_size_in_bits) / 8); + return static_cast(std::ceil(static_cast(total_size_in_bits) / 8)); } From d844bf14179db6fdc28f9e28a50cb63909823af5 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 28 Jun 2024 13:02:19 -0700 Subject: [PATCH 5/7] style fix --- cpp/benchmarks/CMakeLists.txt | 13 ++++++----- cpp/benchmarks/common/benchmark_utilities.cpp | 6 +++-- cpp/benchmarks/common/benchmark_utilities.hpp | 8 +++---- cpp/benchmarks/common/nvbench_utilities.cpp | 17 +++++++------- cpp/benchmarks/common/nvbench_utilities.hpp | 2 +- cpp/benchmarks/common/table_utilities.cpp | 7 +++--- cpp/benchmarks/common/table_utilities.hpp | 8 +++---- cpp/benchmarks/reduction/dictionary.cpp | 2 +- cpp/benchmarks/reduction/rank.cpp | 22 ++++++++++--------- cpp/benchmarks/reduction/scan_structs.cpp | 20 +++++++++-------- 10 files changed, 56 insertions(+), 49 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 09cdeb943d2..a5b248135c1 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -40,12 +40,13 @@ target_include_directories( # Use an OBJECT library so we only compile these helper source files only once add_library( - cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" - synchronization/synchronization.cpp - io/cuio_common.cpp - common/table_utilities.cpp - common/benchmark_utilities.cpp - common/nvbench_utilities.cpp + cudf_benchmark_common OBJECT + "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" + synchronization/synchronization.cpp + io/cuio_common.cpp + common/table_utilities.cpp + common/benchmark_utilities.cpp + common/nvbench_utilities.cpp ) target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $) add_custom_command( diff --git a/cpp/benchmarks/common/benchmark_utilities.cpp b/cpp/benchmarks/common/benchmark_utilities.cpp index 38753e520f9..0b9fc17e779 100644 --- a/cpp/benchmarks/common/benchmark_utilities.cpp +++ 
b/cpp/benchmarks/common/benchmark_utilities.cpp @@ -16,10 +16,12 @@ #include "benchmark_utilities.hpp" -void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration) { +void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration) +{ state.SetItemsProcessed(state.iterations() * items_processed_per_iteration); } -void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration) { +void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration) +{ state.SetBytesProcessed(state.iterations() * bytes_processed_per_iteration); } diff --git a/cpp/benchmarks/common/benchmark_utilities.hpp b/cpp/benchmarks/common/benchmark_utilities.hpp index 4c3f0d04b0e..c5c80e73674 100644 --- a/cpp/benchmarks/common/benchmark_utilities.hpp +++ b/cpp/benchmarks/common/benchmark_utilities.hpp @@ -20,10 +20,10 @@ /** * @brief Sets the number of items processed during the benchmark. - * + * * This function could be used instead of ::benchmark::State.SetItemsProcessed() * to avoid repeatedly computing ::benchmark::State.iterations() * items_processed_per_iteration. - * + * * @param state the benchmark state * @param items_processed_per_iteration number of items processed per iteration */ @@ -31,10 +31,10 @@ void set_items_processed(::benchmark::State& state, int64_t items_processed_per_ /** * @brief Sets the number of bytes processed during the benchmark. - * + * * This function could be used instead of ::benchmark::State.SetItemsProcessed() * to avoid repeatedly computing ::benchmark::State.iterations() * bytes_processed_per_iteration. 
- * + * * @param state the benchmark state * @param bytes_processed_per_iteration number of bytes processed per iteration */ diff --git a/cpp/benchmarks/common/nvbench_utilities.cpp b/cpp/benchmarks/common/nvbench_utilities.cpp index f1541e85374..c740eaa52f4 100644 --- a/cpp/benchmarks/common/nvbench_utilities.cpp +++ b/cpp/benchmarks/common/nvbench_utilities.cpp @@ -18,25 +18,24 @@ #include -// This function is copied over from https://github.com/NVIDIA/nvbench/blob/a171514056e5d6a7f52a035dd6c812fa301d4f4f/nvbench/detail/measure_cold.cu#L190-L224. +// This function is copied over from +// https://github.com/NVIDIA/nvbench/blob/a171514056e5d6a7f52a035dd6c812fa301d4f4f/nvbench/detail/measure_cold.cu#L190-L224. void set_throughputs(nvbench::state& state) { double avg_cuda_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - if (const auto items = state.get_element_count(); items != 0) - { - auto &summ = state.add_summary("nv/cold/bw/item_rate"); + if (const auto items = state.get_element_count(); items != 0) { + auto& summ = state.add_summary("nv/cold/bw/item_rate"); summ.set_string("name", "Elem/s"); summ.set_string("hint", "item_rate"); summ.set_string("description", "Number of input elements processed per second"); summ.set_float64("value", static_cast(items) / avg_cuda_time); } - if (const auto bytes = state.get_global_memory_rw_bytes(); bytes != 0) - { + if (const auto bytes = state.get_global_memory_rw_bytes(); bytes != 0) { const auto avg_used_gmem_bw = static_cast(bytes) / avg_cuda_time; { - auto &summ = state.add_summary("nv/cold/bw/global/bytes_per_second"); + auto& summ = state.add_summary("nv/cold/bw/global/bytes_per_second"); summ.set_string("name", "GlobalMem BW"); summ.set_string("hint", "byte_rate"); summ.set_string("description", @@ -49,7 +48,7 @@ void set_throughputs(nvbench::state& state) const auto peak_gmem_bw = static_cast(state.get_device()->get_global_memory_bus_bandwidth()); - auto &summ = 
state.add_summary("nv/cold/bw/global/utilization"); + auto& summ = state.add_summary("nv/cold/bw/global/utilization"); summ.set_string("name", "BWUtil"); summ.set_string("hint", "percentage"); summ.set_string("description", @@ -57,5 +56,5 @@ void set_throughputs(nvbench::state& state) "device's peak bandwidth"); summ.set_float64("value", avg_used_gmem_bw / peak_gmem_bw); } - } // bandwidth + } } diff --git a/cpp/benchmarks/common/nvbench_utilities.hpp b/cpp/benchmarks/common/nvbench_utilities.hpp index 270b0bed9ef..98d879efac5 100644 --- a/cpp/benchmarks/common/nvbench_utilities.hpp +++ b/cpp/benchmarks/common/nvbench_utilities.hpp @@ -23,7 +23,7 @@ struct state; /** * @brief Sets throughput statistics, such as "Elem/s", "GlobalMem BW", and "BWUtil" for the * nvbench results summary. - * + * * This function could be used to work around a known issue that the throughput statistics * should be added before the nvbench::state.exec() call, otherwise they will not be printed * in the summary. See https://github.com/NVIDIA/nvbench/issues/175 for more details. 
diff --git a/cpp/benchmarks/common/table_utilities.cpp b/cpp/benchmarks/common/table_utilities.cpp index 20c7e7ffd74..a6fbdac9fb8 100644 --- a/cpp/benchmarks/common/table_utilities.cpp +++ b/cpp/benchmarks/common/table_utilities.cpp @@ -16,14 +16,14 @@ #include "table_utilities.hpp" -#include #include +#include #include int64_t estimate_size(cudf::column_view const& col) { - return estimate_size( cudf::table_view( {col} ) ); + return estimate_size(cudf::table_view({col})); } int64_t estimate_size(cudf::table_view const& view) @@ -34,7 +34,8 @@ int64_t estimate_size(cudf::table_view const& view) auto const agg = cudf::make_sum_aggregation(); cudf::data_type sum_dtype{cudf::type_id::INT64}; auto const total_size_scalar = cudf::reduce(*row_sizes, *agg, sum_dtype); - auto const total_size_in_bits = static_cast*>(total_size_scalar.get())->value(); + auto const total_size_in_bits = + static_cast*>(total_size_scalar.get())->value(); // Convert the size in bits to the size in bytes. return static_cast(std::ceil(static_cast(total_size_in_bits) / 8)); } diff --git a/cpp/benchmarks/common/table_utilities.hpp b/cpp/benchmarks/common/table_utilities.hpp index ad7ac5dc451..04ee847d397 100644 --- a/cpp/benchmarks/common/table_utilities.hpp +++ b/cpp/benchmarks/common/table_utilities.hpp @@ -20,22 +20,22 @@ /** * @brief Estimates the column size in bytes. - * + * * @remark As this function internally uses cudf::row_bit_count() to estimate each row size * and accumulates them, the returned estimate may be an inexact approximation in some * cases. See cudf::row_bit_count() for more details. - * + * * @param view The column view to estimate its size */ int64_t estimate_size(cudf::column_view const& view); /** * @brief Estimates the table size in bytes. - * + * * @remark As this function internally uses cudf::row_bit_count() to estimate each row size * and accumulates them, the returned estimate may be an inexact approximation in some * cases. 
See cudf::row_bit_count() for more details. - * + * * @param view The table view to estimate its size */ int64_t estimate_size(cudf::table_view const& view); diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp index 01591739695..5095337dbb3 100644 --- a/cpp/benchmarks/reduction/dictionary.cpp +++ b/cpp/benchmarks/reduction/dictionary.cpp @@ -56,7 +56,7 @@ void BM_reduction_dictionary(benchmark::State& state, // The benchmark takes a column and produces two scalars. set_items_processed(state, column_size + 1); - + // We don't set the metrics for the size read/written as row_bit_count() doesn't // support the dictionary type yet (and so is estimate_size()). // See https://github.com/rapidsai/cudf/issues/16121 for details. diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index ee7f3ad0e66..de7c83aa198 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -15,8 +15,8 @@ */ #include -#include #include +#include #include #include @@ -42,16 +42,18 @@ static void nvbench_reduction_scan(nvbench::state& state, nvbench::type_listview().column(0)); int64_t result_size = 0; - state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - rmm::cuda_stream_view stream_view{launch.get_stream()}; - timer.start(); - auto result = cudf::detail::inclusive_dense_rank_scan( - input, stream_view, rmm::mr::get_current_device_resource()); - timer.stop(); + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + timer.start(); + auto result = cudf::detail::inclusive_dense_rank_scan( + input, stream_view, rmm::mr::get_current_device_resource()); + timer.stop(); - // Estimating the result size will launch a kernel. Do not include it in measuring time. 
- result_size += estimate_size(result->view()); - }); + // Estimating the result size will launch a kernel. Do not include it in measuring + // time. + result_size += estimate_size(result->view()); + }); state.add_element_count(input.size()); state.add_global_memory_reads(estimate_size(input)); diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp index 410843ab0f2..dba58bdf7f5 100644 --- a/cpp/benchmarks/reduction/scan_structs.cpp +++ b/cpp/benchmarks/reduction/scan_structs.cpp @@ -15,8 +15,8 @@ */ #include -#include #include +#include #include #include @@ -55,15 +55,17 @@ static void nvbench_structs_scan(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); int64_t result_size = 0; - state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - timer.start(); - auto const result = cudf::detail::scan_inclusive( - input_view, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); - timer.stop(); + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + timer.start(); + auto const result = cudf::detail::scan_inclusive( + input_view, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); + timer.stop(); - // Estimating the result size will launch a kernel. Do not include it in measuring time. - result_size += estimate_size(result->view()); - }); + // Estimating the result size will launch a kernel. Do not include it in measuring + // time. 
+ result_size += estimate_size(result->view()); + }); state.add_element_count(input_view.size()); state.add_global_memory_reads(estimate_size(input_view)); From 6cb33b39c18e06ec23faf5669130aeebd26c448a Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 1 Jul 2024 15:21:34 -0700 Subject: [PATCH 6/7] more explicit result size computation --- cpp/benchmarks/reduction/rank.cpp | 21 +++++++-------------- cpp/benchmarks/reduction/scan_structs.cpp | 19 ++++++------------- 2 files changed, 13 insertions(+), 27 deletions(-) diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index de7c83aa198..14876c80d3e 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -41,23 +41,16 @@ static void nvbench_reduction_scan(nvbench::state& state, nvbench::type_listview(), 2); cudf::column_view input(new_tbl->view().column(0)); - int64_t result_size = 0; - state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { - rmm::cuda_stream_view stream_view{launch.get_stream()}; - timer.start(); - auto result = cudf::detail::inclusive_dense_rank_scan( - input, stream_view, rmm::mr::get_current_device_resource()); - timer.stop(); - - // Estimating the result size will launch a kernel. Do not include it in measuring - // time. 
- result_size += estimate_size(result->view()); - }); + std::unique_ptr result = nullptr; + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + result = cudf::detail::inclusive_dense_rank_scan( + input, stream_view, rmm::mr::get_current_device_resource()); + }); state.add_element_count(input.size()); state.add_global_memory_reads(estimate_size(input)); - state.add_global_memory_writes(result_size); + state.add_global_memory_writes(estimate_size(result->view())); set_throughputs(state); } diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp index dba58bdf7f5..a781f75a314 100644 --- a/cpp/benchmarks/reduction/scan_structs.cpp +++ b/cpp/benchmarks/reduction/scan_structs.cpp @@ -54,22 +54,15 @@ static void nvbench_structs_scan(nvbench::state& state) auto const stream = cudf::get_default_stream(); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); - int64_t result_size = 0; - state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { - timer.start(); - auto const result = cudf::detail::scan_inclusive( - input_view, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); - timer.stop(); - - // Estimating the result size will launch a kernel. Do not include it in measuring - // time. 
- result_size += estimate_size(result->view()); - }); + std::unique_ptr result = nullptr; + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + result = cudf::detail::scan_inclusive( + input_view, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); + }); state.add_element_count(input_view.size()); state.add_global_memory_reads(estimate_size(input_view)); - state.add_global_memory_writes(result_size); + state.add_global_memory_writes(estimate_size(result->view())); set_throughputs(state); } From 8d051b530f5ac693acb40f825aaa2e0ecab6baf6 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 1 Jul 2024 16:27:18 -0700 Subject: [PATCH 7/7] fix another result size to not accumulate --- cpp/benchmarks/reduction/scan.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/cpp/benchmarks/reduction/scan.cpp b/cpp/benchmarks/reduction/scan.cpp index d7d47a8c9f1..dc05aad9807 100644 --- a/cpp/benchmarks/reduction/scan.cpp +++ b/cpp/benchmarks/reduction/scan.cpp @@ -36,20 +36,16 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls) auto const column = create_random_column(dtype, row_count{n_rows}); if (!include_nulls) column->set_null_mask(rmm::device_buffer{}, 0); - int64_t result_size = 0; + std::unique_ptr result = nullptr; for (auto _ : state) { - std::unique_ptr result = nullptr; - { - cuda_event_timer timer(state, true); - result = cudf::scan( - *column, *cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); - } - result_size = estimate_size(result->view()); + cuda_event_timer timer(state, true); + result = cudf::scan( + *column, *cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); } // The benchmark takes a column and produces a new column of the same size as input. 
set_items_processed(state, n_rows * 2); - set_bytes_processed(state, estimate_size(column->view()) + result_size); + set_bytes_processed(state, estimate_size(column->view()) + estimate_size(result->view())); } #define SCAN_BENCHMARK_DEFINE(name, type, nulls) \