From 013e6bf01b4cd044857ac0e8f69ad481261ece90 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Thu, 27 Jun 2024 16:04:11 -0700 Subject: [PATCH 1/7] Add throughput metrics, such as bytes_per_second for google benchmarks or GlobalMem BW for nvbench, for reduction benchmarks --- cpp/benchmarks/CMakeLists.txt | 6 +- cpp/benchmarks/common/benchmark_utilities.cpp | 25 ++++++++ cpp/benchmarks/common/benchmark_utilities.hpp | 41 +++++++++++++ cpp/benchmarks/common/nvbench_utilities.cpp | 61 +++++++++++++++++++ cpp/benchmarks/common/nvbench_utilities.hpp | 31 ++++++++++ cpp/benchmarks/common/table_utilities.cpp | 41 +++++++++++++ cpp/benchmarks/common/table_utilities.hpp | 41 +++++++++++++ cpp/benchmarks/reduction/anyall.cpp | 8 ++- cpp/benchmarks/reduction/dictionary.cpp | 8 +++ cpp/benchmarks/reduction/minmax.cpp | 13 +++- cpp/benchmarks/reduction/rank.cpp | 16 ++++- cpp/benchmarks/reduction/reduce.cpp | 8 ++- cpp/benchmarks/reduction/scan.cpp | 19 ++++-- cpp/benchmarks/reduction/scan_structs.cpp | 25 ++++++-- 14 files changed, 328 insertions(+), 15 deletions(-) create mode 100644 cpp/benchmarks/common/benchmark_utilities.cpp create mode 100644 cpp/benchmarks/common/benchmark_utilities.hpp create mode 100644 cpp/benchmarks/common/nvbench_utilities.cpp create mode 100644 cpp/benchmarks/common/nvbench_utilities.hpp create mode 100644 cpp/benchmarks/common/table_utilities.cpp create mode 100644 cpp/benchmarks/common/table_utilities.hpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 8a48126e195..09cdeb943d2 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -41,7 +41,11 @@ target_include_directories( # Use an OBJECT library so we only compile these helper source files only once add_library( cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" - synchronization/synchronization.cpp io/cuio_common.cpp + synchronization/synchronization.cpp + io/cuio_common.cpp + common/table_utilities.cpp 
+ common/benchmark_utilities.cpp + common/nvbench_utilities.cpp ) target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $) add_custom_command( diff --git a/cpp/benchmarks/common/benchmark_utilities.cpp b/cpp/benchmarks/common/benchmark_utilities.cpp new file mode 100644 index 00000000000..29336c3b0ef --- /dev/null +++ b/cpp/benchmarks/common/benchmark_utilities.cpp @@ -0,0 +1,25 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "benchmark_utilities.hpp" + +void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration) { + state.SetItemsProcessed(state.iterations() * items_processed_per_iteration); +} + +void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration) { + state.SetBytesProcessed(state.iterations() * bytes_processed_per_iteration); +} diff --git a/cpp/benchmarks/common/benchmark_utilities.hpp b/cpp/benchmarks/common/benchmark_utilities.hpp new file mode 100644 index 00000000000..4f0b3794022 --- /dev/null +++ b/cpp/benchmarks/common/benchmark_utilities.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +/** + * @brief Sets the number of items processed during the benchmark. + * + * This function could be used instead of ::benchmark::State.SetItemsProcessed() + * to avoid repeatedly computing ::benchmark::State.iterations() * items_processed_per_iteration. + * + * @param state the benchmark state + * @param items_processed_per_iteration number of items processed per iteration + */ +void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration); + +/** + * @brief Sets the number of bytes processed during the benchmark. + * + * This function could be used instead of ::benchmark::State.SetBytesProcessed() + * to avoid repeatedly computing ::benchmark::State.iterations() * bytes_processed_per_iteration. + * + * @param state the benchmark state + * @param bytes_processed_per_iteration number of bytes processed per iteration + */ +void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration); diff --git a/cpp/benchmarks/common/nvbench_utilities.cpp b/cpp/benchmarks/common/nvbench_utilities.cpp new file mode 100644 index 00000000000..ca2f7f5d698 --- /dev/null +++ b/cpp/benchmarks/common/nvbench_utilities.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nvbench_utilities.hpp" + +#include + +// This function is copied over from https://github.com/NVIDIA/nvbench/blob/a171514056e5d6a7f52a035dd6c812fa301d4f4f/nvbench/detail/measure_cold.cu#L190-L224. +void set_throughputs(nvbench::state& state) +{ + double avg_cuda_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); + + if (const auto items = state.get_element_count(); items != 0) + { + auto &summ = state.add_summary("nv/cold/bw/item_rate"); + summ.set_string("name", "Elem/s"); + summ.set_string("hint", "item_rate"); + summ.set_string("description", "Number of input elements processed per second"); + summ.set_float64("value", static_cast(items) / avg_cuda_time); + } + + if (const auto bytes = state.get_global_memory_rw_bytes(); bytes != 0) + { + const auto avg_used_gmem_bw = static_cast(bytes) / avg_cuda_time; + { + auto &summ = state.add_summary("nv/cold/bw/global/bytes_per_second"); + summ.set_string("name", "GlobalMem BW"); + summ.set_string("hint", "byte_rate"); + summ.set_string("description", + "Number of bytes read/written per second to the CUDA " + "device's global memory"); + summ.set_float64("value", avg_used_gmem_bw); + } + + { + const auto peak_gmem_bw = + static_cast(state.get_device()->get_global_memory_bus_bandwidth()); + + auto &summ = state.add_summary("nv/cold/bw/global/utilization"); + summ.set_string("name", "BWUtil"); + summ.set_string("hint", "percentage"); + summ.set_string("description", + "Global device memory utilization as a percentage of the " + "device's peak bandwidth"); + 
summ.set_float64("value", avg_used_gmem_bw / peak_gmem_bw); + } + } // bandwidth +} diff --git a/cpp/benchmarks/common/nvbench_utilities.hpp b/cpp/benchmarks/common/nvbench_utilities.hpp new file mode 100644 index 00000000000..19997b6dd4d --- /dev/null +++ b/cpp/benchmarks/common/nvbench_utilities.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +namespace nvbench { +struct state; +} + +/** + * @brief Sets throughput statistics, such as "Elem/s", "GlobalMem BW", and "BWUtil" for the + * nvbench results summary. + * + * This function could be used to work around a known issue that the throughput statistics + * should be added before the nvbench::state.exec() call, otherwise they will not be printed + * in the summary. See https://github.com/NVIDIA/nvbench/issues/175 for more details. + */ +void set_throughputs(nvbench::state& state); diff --git a/cpp/benchmarks/common/table_utilities.cpp b/cpp/benchmarks/common/table_utilities.cpp new file mode 100644 index 00000000000..c35af1e0b76 --- /dev/null +++ b/cpp/benchmarks/common/table_utilities.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "table_utilities.hpp" + +#include +#include + +int64_t estimate_size(std::unique_ptr column) +{ + std::vector> columns; + columns.emplace_back(std::move(column)); + cudf::table table{std::move(columns)}; + return estimate_size(table.view()); +} + +int64_t estimate_size(cudf::table_view const& view) +{ + // Compute the size in bits for each row. + auto const row_sizes = cudf::row_bit_count(view); + // Accumulate the row sizes to compute a sum. + auto const agg = cudf::make_sum_aggregation(); + cudf::data_type sum_dtype{cudf::type_id::INT64}; + auto const total_size_scalar = cudf::reduce(*row_sizes, *agg, sum_dtype); + auto const total_size_in_bits = static_cast*>(total_size_scalar.get())->value(); + // Convert the size in bits to the size in bytes. + return static_cast(static_cast(total_size_in_bits) / 8); +} diff --git a/cpp/benchmarks/common/table_utilities.hpp b/cpp/benchmarks/common/table_utilities.hpp new file mode 100644 index 00000000000..f2c9535f411 --- /dev/null +++ b/cpp/benchmarks/common/table_utilities.hpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +/** + * @brief Estimates the column size in bytes. + * + * @remark As this function internally uses cudf::row_bit_count() to estimate each row size + * and accumulates them, the returned estimate may be an inexact approximation in some + * cases. See cudf::row_bit_count() for more details. + * + * @param column The column to estimate its size + */ +int64_t estimate_size(std::unique_ptr column); + +/** + * @brief Estimates the table size in bytes. + * + * @remark As this function internally uses cudf::row_bit_count() to estimate each row size + * and accumulates them, the returned estimate may be an inexact approximation in some + * cases. See cudf::row_bit_count() for more details. + * + * @param view The view to estimate its size + */ +int64_t estimate_size(cudf::table_view const& view); diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp index 8b1e71c1585..13ddbf3605e 100644 --- a/cpp/benchmarks/reduction/anyall.cpp +++ b/cpp/benchmarks/reduction/anyall.cpp @@ -14,7 +14,9 @@ * limitations under the License. */ +#include #include +#include #include #include @@ -34,7 +36,7 @@ void BM_reduction_anyall(benchmark::State& state, auto const dtype = cudf::type_to_id(); data_profile const profile = data_profile_builder().no_validity().distribution( dtype, distribution_id::UNIFORM, 0, agg->kind == cudf::aggregation::ANY ? 
0 : 100); - auto const values = create_random_column(dtype, row_count{column_size}, profile); + auto values = create_random_column(dtype, row_count{column_size}, profile); cudf::data_type output_dtype{cudf::type_id::BOOL8}; @@ -42,6 +44,10 @@ void BM_reduction_anyall(benchmark::State& state, cuda_event_timer timer(state, true); auto result = cudf::reduce(*values, *agg, output_dtype); } + + // The benchmark takes a column and produces one scalar. + set_items_processed(state, column_size + 1); + set_bytes_processed(state, estimate_size(std::move(values)) + cudf::size_of(output_dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp index c1c44c919ac..f715277aae7 100644 --- a/cpp/benchmarks/reduction/dictionary.cpp +++ b/cpp/benchmarks/reduction/dictionary.cpp @@ -14,6 +14,7 @@ * limitations under the License. */ +#include #include #include #include @@ -52,6 +53,13 @@ void BM_reduction_dictionary(benchmark::State& state, cuda_event_timer timer(state, true); auto result = cudf::reduce(*values, *agg, output_dtype); } + + // The benchmark takes a column and produces one scalar. + set_items_processed(state, column_size + 1); + + // We don't set the metrics for the size read/written as row_bit_count() doesn't + // support the dictionary type yet (and neither does estimate_size()). + // See https://github.com/rapidsai/cudf/issues/16121 for details. } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp index 963c26692e7..07efb016877 100644 --- a/cpp/benchmarks/reduction/minmax.cpp +++ b/cpp/benchmarks/reduction/minmax.cpp @@ -14,7 +14,9 @@ * limitations under the License.
*/ +#include #include +#include #include #include @@ -28,14 +30,19 @@ template void BM_reduction(benchmark::State& state) { cudf::size_type const column_size{(cudf::size_type)state.range(0)}; - auto const dtype = cudf::type_to_id(); - auto const input_column = - create_random_column(dtype, row_count{column_size}, data_profile_builder().no_validity()); + auto const dtype_id = cudf::type_to_id(); + auto input_column = + create_random_column(dtype_id, row_count{column_size}, data_profile_builder().no_validity()); for (auto _ : state) { cuda_event_timer timer(state, true); auto result = cudf::minmax(*input_column); } + + // The benchmark takes a column and produces two scalars. + set_items_processed(state, column_size + 2); + cudf::data_type dtype = cudf::data_type{dtype_id}; + set_bytes_processed(state, estimate_size(std::move(input_column)) + 2 * cudf::size_of(dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index e55f3b9e09f..4878bacc3f4 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -15,6 +15,8 @@ */ #include +#include +#include #include #include @@ -39,11 +41,23 @@ static void nvbench_reduction_scan(nvbench::state& state, nvbench::type_listview(), 2); cudf::column_view input(new_tbl->view().column(0)); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + int64_t result_size = 0; + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { rmm::cuda_stream_view stream_view{launch.get_stream()}; + timer.start(); auto result = cudf::detail::inclusive_dense_rank_scan( input, stream_view, rmm::mr::get_current_device_resource()); + timer.stop(); + + // Estimating the result size will launch a kernel. Do not include it in measuring time. 
+ result_size += estimate_size(std::move(result)); }); + + state.add_element_count(new_tbl->num_rows()); + state.add_global_memory_reads(estimate_size(new_tbl->view())); + state.add_global_memory_writes(result_size); + + set_throughputs(state); } using data_type = nvbench::type_list; diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp index 5bd3e2e3bba..0d8f8285428 100644 --- a/cpp/benchmarks/reduction/reduce.cpp +++ b/cpp/benchmarks/reduction/reduce.cpp @@ -14,7 +14,9 @@ * limitations under the License. */ +#include #include +#include #include #include @@ -34,7 +36,7 @@ void BM_reduction(benchmark::State& state, std::unique_ptr(); data_profile const profile = data_profile_builder().no_validity().distribution(dtype, distribution_id::UNIFORM, 0, 100); - auto const input_column = create_random_column(dtype, row_count{column_size}, profile); + auto input_column = create_random_column(dtype, row_count{column_size}, profile); cudf::data_type output_dtype = (agg->kind == cudf::aggregation::MEAN || agg->kind == cudf::aggregation::VARIANCE || @@ -46,6 +48,10 @@ void BM_reduction(benchmark::State& state, std::unique_ptr #include +#include #include #include @@ -31,14 +33,23 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const dtype = cudf::type_to_id(); - auto const column = create_random_column(dtype, row_count{n_rows}); + auto column = create_random_column(dtype, row_count{n_rows}); if (!include_nulls) column->set_null_mask(rmm::device_buffer{}, 0); + int64_t result_size = 0; for (auto _ : state) { - cuda_event_timer timer(state, true); - auto result = cudf::scan( - *column, *cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); + std::unique_ptr result = nullptr; + { + cuda_event_timer timer(state, true); + result = cudf::scan( + *column, *cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); + } + result_size = 
estimate_size(std::move(result)); } + + // The benchmark takes a column and produces a new column of the same size as input. + set_items_processed(state, n_rows * 2); + set_bytes_processed(state, estimate_size(std::move(column)) + result_size); } #define SCAN_BENCHMARK_DEFINE(name, type, nulls) \ diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp index ee97b54fbef..cee25e33ef0 100644 --- a/cpp/benchmarks/reduction/scan_structs.cpp +++ b/cpp/benchmarks/reduction/scan_structs.cpp @@ -15,6 +15,8 @@ */ #include +#include +#include #include #include @@ -43,18 +45,33 @@ static void nvbench_structs_scan(nvbench::state& state) row_count{size}, profile); auto [null_mask, null_count] = create_random_null_mask(size, null_probability); - auto const input = cudf::make_structs_column( + auto input = cudf::make_structs_column( size, std::move(data_table->release()), null_count, std::move(null_mask)); + std::vector> columns; + columns.emplace_back(std::move(input)); + cudf::table input_table{std::move(columns)}; auto const agg = cudf::make_min_aggregation(); auto const null_policy = static_cast(state.get_int64("null_policy")); auto const stream = cudf::get_default_stream(); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); - state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - auto const result = cudf::detail::scan_inclusive( - *input, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); + int64_t result_size = 0; + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { + timer.start(); + auto result = cudf::detail::scan_inclusive( + input_table.view().column(0), *agg, null_policy, stream, rmm::mr::get_current_device_resource()); + timer.stop(); + + // Estimating the result size will launch a kernel. Do not include it in measuring time. 
+ result_size += estimate_size(std::move(result)); }); + + state.add_element_count(input_table.num_rows()); + state.add_global_memory_reads(estimate_size(input_table.view())); + state.add_global_memory_writes(result_size); + + set_throughputs(state); } NVBENCH_BENCH(nvbench_structs_scan) From 2daeff561b3ab6b7317fc9abe8529f9c1f4ce497 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 28 Jun 2024 11:23:12 -0700 Subject: [PATCH 2/7] fix copyrights --- cpp/benchmarks/common/benchmark_utilities.cpp | 2 +- cpp/benchmarks/common/benchmark_utilities.hpp | 2 +- cpp/benchmarks/common/nvbench_utilities.cpp | 2 +- cpp/benchmarks/common/nvbench_utilities.hpp | 2 +- cpp/benchmarks/common/table_utilities.cpp | 2 +- cpp/benchmarks/common/table_utilities.hpp | 2 +- cpp/benchmarks/reduction/anyall.cpp | 2 +- cpp/benchmarks/reduction/dictionary.cpp | 2 +- cpp/benchmarks/reduction/minmax.cpp | 2 +- cpp/benchmarks/reduction/rank.cpp | 2 +- cpp/benchmarks/reduction/reduce.cpp | 2 +- cpp/benchmarks/reduction/scan.cpp | 2 +- cpp/benchmarks/reduction/scan_structs.cpp | 2 +- 13 files changed, 13 insertions(+), 13 deletions(-) diff --git a/cpp/benchmarks/common/benchmark_utilities.cpp b/cpp/benchmarks/common/benchmark_utilities.cpp index 29336c3b0ef..38753e520f9 100644 --- a/cpp/benchmarks/common/benchmark_utilities.cpp +++ b/cpp/benchmarks/common/benchmark_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/common/benchmark_utilities.hpp b/cpp/benchmarks/common/benchmark_utilities.hpp index 4f0b3794022..4c3f0d04b0e 100644 --- a/cpp/benchmarks/common/benchmark_utilities.hpp +++ b/cpp/benchmarks/common/benchmark_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/common/nvbench_utilities.cpp b/cpp/benchmarks/common/nvbench_utilities.cpp index ca2f7f5d698..f1541e85374 100644 --- a/cpp/benchmarks/common/nvbench_utilities.cpp +++ b/cpp/benchmarks/common/nvbench_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/common/nvbench_utilities.hpp b/cpp/benchmarks/common/nvbench_utilities.hpp index 19997b6dd4d..270b0bed9ef 100644 --- a/cpp/benchmarks/common/nvbench_utilities.hpp +++ b/cpp/benchmarks/common/nvbench_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/common/table_utilities.cpp b/cpp/benchmarks/common/table_utilities.cpp index c35af1e0b76..f0e31655aa0 100644 --- a/cpp/benchmarks/common/table_utilities.cpp +++ b/cpp/benchmarks/common/table_utilities.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/common/table_utilities.hpp b/cpp/benchmarks/common/table_utilities.hpp index f2c9535f411..2e47828e8cd 100644 --- a/cpp/benchmarks/common/table_utilities.hpp +++ b/cpp/benchmarks/common/table_utilities.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. + * Copyright (c) 2024, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp index 13ddbf3605e..56311771c9c 100644 --- a/cpp/benchmarks/reduction/anyall.cpp +++ b/cpp/benchmarks/reduction/anyall.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp index f715277aae7..01591739695 100644 --- a/cpp/benchmarks/reduction/dictionary.cpp +++ b/cpp/benchmarks/reduction/dictionary.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp index 07efb016877..39583b0afba 100644 --- a/cpp/benchmarks/reduction/minmax.cpp +++ b/cpp/benchmarks/reduction/minmax.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index 4878bacc3f4..fc35d43272a 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp index 0d8f8285428..a2d00df5371 100644 --- a/cpp/benchmarks/reduction/reduce.cpp +++ b/cpp/benchmarks/reduction/reduce.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/reduction/scan.cpp b/cpp/benchmarks/reduction/scan.cpp index 6d6f93ad90b..a4a89c27727 100644 --- a/cpp/benchmarks/reduction/scan.cpp +++ b/cpp/benchmarks/reduction/scan.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2021-2023, NVIDIA CORPORATION. + * Copyright (c) 2021-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp index cee25e33ef0..55adfac2afc 100644 --- a/cpp/benchmarks/reduction/scan_structs.cpp +++ b/cpp/benchmarks/reduction/scan_structs.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2022-2023, NVIDIA CORPORATION. + * Copyright (c) 2022-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
From 40804e27cd510f746e92105a23e7feb9a35d6cbe Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 28 Jun 2024 11:57:34 -0700 Subject: [PATCH 3/7] estimate_size() should take a column_view --- cpp/benchmarks/common/table_utilities.cpp | 7 ++----- cpp/benchmarks/common/table_utilities.hpp | 6 +++--- cpp/benchmarks/reduction/anyall.cpp | 4 ++-- cpp/benchmarks/reduction/minmax.cpp | 4 ++-- cpp/benchmarks/reduction/rank.cpp | 6 +++--- cpp/benchmarks/reduction/reduce.cpp | 4 ++-- cpp/benchmarks/reduction/scan.cpp | 6 +++--- cpp/benchmarks/reduction/scan_structs.cpp | 16 +++++++--------- 8 files changed, 24 insertions(+), 29 deletions(-) diff --git a/cpp/benchmarks/common/table_utilities.cpp b/cpp/benchmarks/common/table_utilities.cpp index f0e31655aa0..0ac90e3dbf8 100644 --- a/cpp/benchmarks/common/table_utilities.cpp +++ b/cpp/benchmarks/common/table_utilities.cpp @@ -19,12 +19,9 @@ #include #include -int64_t estimate_size(std::unique_ptr column) +int64_t estimate_size(cudf::column_view const& col) { - std::vector> columns; - columns.emplace_back(std::move(column)); - cudf::table table{std::move(columns)}; - return estimate_size(table.view()); + return estimate_size( cudf::table_view( {col} ) ); } int64_t estimate_size(cudf::table_view const& view) diff --git a/cpp/benchmarks/common/table_utilities.hpp b/cpp/benchmarks/common/table_utilities.hpp index 2e47828e8cd..ad7ac5dc451 100644 --- a/cpp/benchmarks/common/table_utilities.hpp +++ b/cpp/benchmarks/common/table_utilities.hpp @@ -25,9 +25,9 @@ * and accumulates them, the returned estimate may be an inexact approximation in some * cases. See cudf::row_bit_count() for more details. * - * @param column The column to estimate its size + * @param view The column view to estimate its size */ -int64_t estimate_size(std::unique_ptr column); +int64_t estimate_size(cudf::column_view const& view); /** * @brief Estimates the table size in bytes. 
@@ -36,6 +36,6 @@ int64_t estimate_size(std::unique_ptr column); * and accumulates them, the returned estimate may be an inexact approximation in some * cases. See cudf::row_bit_count() for more details. * - * @param view The view to estimate its size + * @param view The table view to estimate its size */ int64_t estimate_size(cudf::table_view const& view); diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp index 56311771c9c..e9d23881764 100644 --- a/cpp/benchmarks/reduction/anyall.cpp +++ b/cpp/benchmarks/reduction/anyall.cpp @@ -36,7 +36,7 @@ void BM_reduction_anyall(benchmark::State& state, auto const dtype = cudf::type_to_id(); data_profile const profile = data_profile_builder().no_validity().distribution( dtype, distribution_id::UNIFORM, 0, agg->kind == cudf::aggregation::ANY ? 0 : 100); - auto values = create_random_column(dtype, row_count{column_size}, profile); + auto const values = create_random_column(dtype, row_count{column_size}, profile); cudf::data_type output_dtype{cudf::type_id::BOOL8}; @@ -47,7 +47,7 @@ void BM_reduction_anyall(benchmark::State& state, // The benchmark takes a column and produces one scalar. 
set_items_processed(state, column_size + 1); - set_bytes_processed(state, estimate_size(std::move(values)) + cudf::size_of(output_dtype)); + set_bytes_processed(state, estimate_size(values->view()) + cudf::size_of(output_dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/minmax.cpp b/cpp/benchmarks/reduction/minmax.cpp index 39583b0afba..050f2887221 100644 --- a/cpp/benchmarks/reduction/minmax.cpp +++ b/cpp/benchmarks/reduction/minmax.cpp @@ -31,7 +31,7 @@ void BM_reduction(benchmark::State& state) { cudf::size_type const column_size{(cudf::size_type)state.range(0)}; auto const dtype_id = cudf::type_to_id(); - auto input_column = + auto const input_column = create_random_column(dtype_id, row_count{column_size}, data_profile_builder().no_validity()); for (auto _ : state) { @@ -42,7 +42,7 @@ void BM_reduction(benchmark::State& state) // The benchmark takes a column and produces two scalars. set_items_processed(state, column_size + 2); cudf::data_type dtype = cudf::data_type{dtype_id}; - set_bytes_processed(state, estimate_size(std::move(input_column)) + 2 * cudf::size_of(dtype)); + set_bytes_processed(state, estimate_size(input_column->view()) + 2 * cudf::size_of(dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index fc35d43272a..ee7f3ad0e66 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -50,11 +50,11 @@ static void nvbench_reduction_scan(nvbench::state& state, nvbench::type_listview()); }); - state.add_element_count(new_tbl->num_rows()); - state.add_global_memory_reads(estimate_size(new_tbl->view())); + state.add_element_count(input.size()); + state.add_global_memory_reads(estimate_size(input)); state.add_global_memory_writes(result_size); set_throughputs(state); diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp index a2d00df5371..63c96f4fe9e 100644 --- 
a/cpp/benchmarks/reduction/reduce.cpp +++ b/cpp/benchmarks/reduction/reduce.cpp @@ -36,7 +36,7 @@ void BM_reduction(benchmark::State& state, std::unique_ptr(); data_profile const profile = data_profile_builder().no_validity().distribution(dtype, distribution_id::UNIFORM, 0, 100); - auto input_column = create_random_column(dtype, row_count{column_size}, profile); + auto const input_column = create_random_column(dtype, row_count{column_size}, profile); cudf::data_type output_dtype = (agg->kind == cudf::aggregation::MEAN || agg->kind == cudf::aggregation::VARIANCE || @@ -51,7 +51,7 @@ void BM_reduction(benchmark::State& state, std::unique_ptrview()) + cudf::size_of(output_dtype)); } #define concat(a, b, c) a##b##c diff --git a/cpp/benchmarks/reduction/scan.cpp b/cpp/benchmarks/reduction/scan.cpp index a4a89c27727..d7d47a8c9f1 100644 --- a/cpp/benchmarks/reduction/scan.cpp +++ b/cpp/benchmarks/reduction/scan.cpp @@ -33,7 +33,7 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls) { cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; auto const dtype = cudf::type_to_id(); - auto column = create_random_column(dtype, row_count{n_rows}); + auto const column = create_random_column(dtype, row_count{n_rows}); if (!include_nulls) column->set_null_mask(rmm::device_buffer{}, 0); int64_t result_size = 0; @@ -44,12 +44,12 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls) result = cudf::scan( *column, *cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); } - result_size = estimate_size(std::move(result)); + result_size = estimate_size(result->view()); } // The benchmark takes a column and produces a new column of the same size as input. 
set_items_processed(state, n_rows * 2); - set_bytes_processed(state, estimate_size(std::move(column)) + result_size); + set_bytes_processed(state, estimate_size(column->view()) + result_size); } #define SCAN_BENCHMARK_DEFINE(name, type, nulls) \ diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp index 55adfac2afc..410843ab0f2 100644 --- a/cpp/benchmarks/reduction/scan_structs.cpp +++ b/cpp/benchmarks/reduction/scan_structs.cpp @@ -45,11 +45,9 @@ static void nvbench_structs_scan(nvbench::state& state) row_count{size}, profile); auto [null_mask, null_count] = create_random_null_mask(size, null_probability); - auto input = cudf::make_structs_column( + auto const input = cudf::make_structs_column( size, std::move(data_table->release()), null_count, std::move(null_mask)); - std::vector> columns; - columns.emplace_back(std::move(input)); - cudf::table input_table{std::move(columns)}; + auto input_view = input->view(); auto const agg = cudf::make_min_aggregation(); auto const null_policy = static_cast(state.get_int64("null_policy")); @@ -59,16 +57,16 @@ static void nvbench_structs_scan(nvbench::state& state) int64_t result_size = 0; state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { timer.start(); - auto result = cudf::detail::scan_inclusive( - input_table.view().column(0), *agg, null_policy, stream, rmm::mr::get_current_device_resource()); + auto const result = cudf::detail::scan_inclusive( + input_view, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); timer.stop(); // Estimating the result size will launch a kernel. Do not include it in measuring time. 
- result_size += estimate_size(std::move(result)); + result_size += estimate_size(result->view()); }); - state.add_element_count(input_table.num_rows()); - state.add_global_memory_reads(estimate_size(input_table.view())); + state.add_element_count(input_view.size()); + state.add_global_memory_reads(estimate_size(input_view)); state.add_global_memory_writes(result_size); set_throughputs(state); From 951795e4e376f5a10ceb83f095294f0b8c605f18 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 28 Jun 2024 12:02:46 -0700 Subject: [PATCH 4/7] add missing roundup --- cpp/benchmarks/common/table_utilities.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/cpp/benchmarks/common/table_utilities.cpp b/cpp/benchmarks/common/table_utilities.cpp index 0ac90e3dbf8..20c7e7ffd74 100644 --- a/cpp/benchmarks/common/table_utilities.cpp +++ b/cpp/benchmarks/common/table_utilities.cpp @@ -19,6 +19,8 @@ #include #include +#include + int64_t estimate_size(cudf::column_view const& col) { return estimate_size( cudf::table_view( {col} ) ); @@ -34,5 +36,5 @@ int64_t estimate_size(cudf::table_view const& view) auto const total_size_scalar = cudf::reduce(*row_sizes, *agg, sum_dtype); auto const total_size_in_bits = static_cast*>(total_size_scalar.get())->value(); // Convert the size in bits to the size in bytes. 
- return static_cast(static_cast(total_size_in_bits) / 8); + return static_cast(std::ceil(static_cast(total_size_in_bits) / 8)); } From d844bf14179db6fdc28f9e28a50cb63909823af5 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Fri, 28 Jun 2024 13:02:19 -0700 Subject: [PATCH 5/7] style fix --- cpp/benchmarks/CMakeLists.txt | 13 ++++++----- cpp/benchmarks/common/benchmark_utilities.cpp | 6 +++-- cpp/benchmarks/common/benchmark_utilities.hpp | 8 +++---- cpp/benchmarks/common/nvbench_utilities.cpp | 17 +++++++------- cpp/benchmarks/common/nvbench_utilities.hpp | 2 +- cpp/benchmarks/common/table_utilities.cpp | 7 +++--- cpp/benchmarks/common/table_utilities.hpp | 8 +++---- cpp/benchmarks/reduction/dictionary.cpp | 2 +- cpp/benchmarks/reduction/rank.cpp | 22 ++++++++++--------- cpp/benchmarks/reduction/scan_structs.cpp | 20 +++++++++-------- 10 files changed, 56 insertions(+), 49 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 09cdeb943d2..a5b248135c1 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -40,12 +40,13 @@ target_include_directories( # Use an OBJECT library so we only compile these helper source files only once add_library( - cudf_benchmark_common OBJECT "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" - synchronization/synchronization.cpp - io/cuio_common.cpp - common/table_utilities.cpp - common/benchmark_utilities.cpp - common/nvbench_utilities.cpp + cudf_benchmark_common OBJECT + "${CUDF_SOURCE_DIR}/tests/utilities/random_seed.cpp" + synchronization/synchronization.cpp + io/cuio_common.cpp + common/table_utilities.cpp + common/benchmark_utilities.cpp + common/nvbench_utilities.cpp ) target_link_libraries(cudf_benchmark_common PRIVATE cudf_datagen $) add_custom_command( diff --git a/cpp/benchmarks/common/benchmark_utilities.cpp b/cpp/benchmarks/common/benchmark_utilities.cpp index 38753e520f9..0b9fc17e779 100644 --- a/cpp/benchmarks/common/benchmark_utilities.cpp +++ 
b/cpp/benchmarks/common/benchmark_utilities.cpp @@ -16,10 +16,12 @@ #include "benchmark_utilities.hpp" -void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration) { +void set_items_processed(::benchmark::State& state, int64_t items_processed_per_iteration) +{ state.SetItemsProcessed(state.iterations() * items_processed_per_iteration); } -void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration) { +void set_bytes_processed(::benchmark::State& state, int64_t bytes_processed_per_iteration) +{ state.SetBytesProcessed(state.iterations() * bytes_processed_per_iteration); } diff --git a/cpp/benchmarks/common/benchmark_utilities.hpp b/cpp/benchmarks/common/benchmark_utilities.hpp index 4c3f0d04b0e..c5c80e73674 100644 --- a/cpp/benchmarks/common/benchmark_utilities.hpp +++ b/cpp/benchmarks/common/benchmark_utilities.hpp @@ -20,10 +20,10 @@ /** * @brief Sets the number of items processed during the benchmark. - * + * * This function could be used instead of ::benchmark::State.SetItemsProcessed() * to avoid repeatedly computing ::benchmark::State.iterations() * items_processed_per_iteration. - * + * * @param state the benchmark state * @param items_processed_per_iteration number of items processed per iteration */ @@ -31,10 +31,10 @@ void set_items_processed(::benchmark::State& state, int64_t items_processed_per_ /** * @brief Sets the number of bytes processed during the benchmark. - * + * * This function could be used instead of ::benchmark::State.SetItemsProcessed() * to avoid repeatedly computing ::benchmark::State.iterations() * bytes_processed_per_iteration. 
- * + * * @param state the benchmark state * @param bytes_processed_per_iteration number of bytes processed per iteration */ diff --git a/cpp/benchmarks/common/nvbench_utilities.cpp b/cpp/benchmarks/common/nvbench_utilities.cpp index f1541e85374..c740eaa52f4 100644 --- a/cpp/benchmarks/common/nvbench_utilities.cpp +++ b/cpp/benchmarks/common/nvbench_utilities.cpp @@ -18,25 +18,24 @@ #include -// This function is copied over from https://github.com/NVIDIA/nvbench/blob/a171514056e5d6a7f52a035dd6c812fa301d4f4f/nvbench/detail/measure_cold.cu#L190-L224. +// This function is copied over from +// https://github.com/NVIDIA/nvbench/blob/a171514056e5d6a7f52a035dd6c812fa301d4f4f/nvbench/detail/measure_cold.cu#L190-L224. void set_throughputs(nvbench::state& state) { double avg_cuda_time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); - if (const auto items = state.get_element_count(); items != 0) - { - auto &summ = state.add_summary("nv/cold/bw/item_rate"); + if (const auto items = state.get_element_count(); items != 0) { + auto& summ = state.add_summary("nv/cold/bw/item_rate"); summ.set_string("name", "Elem/s"); summ.set_string("hint", "item_rate"); summ.set_string("description", "Number of input elements processed per second"); summ.set_float64("value", static_cast(items) / avg_cuda_time); } - if (const auto bytes = state.get_global_memory_rw_bytes(); bytes != 0) - { + if (const auto bytes = state.get_global_memory_rw_bytes(); bytes != 0) { const auto avg_used_gmem_bw = static_cast(bytes) / avg_cuda_time; { - auto &summ = state.add_summary("nv/cold/bw/global/bytes_per_second"); + auto& summ = state.add_summary("nv/cold/bw/global/bytes_per_second"); summ.set_string("name", "GlobalMem BW"); summ.set_string("hint", "byte_rate"); summ.set_string("description", @@ -49,7 +48,7 @@ void set_throughputs(nvbench::state& state) const auto peak_gmem_bw = static_cast(state.get_device()->get_global_memory_bus_bandwidth()); - auto &summ = 
state.add_summary("nv/cold/bw/global/utilization"); + auto& summ = state.add_summary("nv/cold/bw/global/utilization"); summ.set_string("name", "BWUtil"); summ.set_string("hint", "percentage"); summ.set_string("description", @@ -57,5 +56,5 @@ void set_throughputs(nvbench::state& state) "device's peak bandwidth"); summ.set_float64("value", avg_used_gmem_bw / peak_gmem_bw); } - } // bandwidth + } } diff --git a/cpp/benchmarks/common/nvbench_utilities.hpp b/cpp/benchmarks/common/nvbench_utilities.hpp index 270b0bed9ef..98d879efac5 100644 --- a/cpp/benchmarks/common/nvbench_utilities.hpp +++ b/cpp/benchmarks/common/nvbench_utilities.hpp @@ -23,7 +23,7 @@ struct state; /** * @brief Sets throughput statistics, such as "Elem/s", "GlobalMem BW", and "BWUtil" for the * nvbench results summary. - * + * * This function could be used to work around a known issue that the throughput statistics * should be added before the nvbench::state.exec() call, otherwise they will not be printed * in the summary. See https://github.com/NVIDIA/nvbench/issues/175 for more details. 
diff --git a/cpp/benchmarks/common/table_utilities.cpp b/cpp/benchmarks/common/table_utilities.cpp index 20c7e7ffd74..a6fbdac9fb8 100644 --- a/cpp/benchmarks/common/table_utilities.cpp +++ b/cpp/benchmarks/common/table_utilities.cpp @@ -16,14 +16,14 @@ #include "table_utilities.hpp" -#include #include +#include #include int64_t estimate_size(cudf::column_view const& col) { - return estimate_size( cudf::table_view( {col} ) ); + return estimate_size(cudf::table_view({col})); } int64_t estimate_size(cudf::table_view const& view) @@ -34,7 +34,8 @@ int64_t estimate_size(cudf::table_view const& view) auto const agg = cudf::make_sum_aggregation(); cudf::data_type sum_dtype{cudf::type_id::INT64}; auto const total_size_scalar = cudf::reduce(*row_sizes, *agg, sum_dtype); - auto const total_size_in_bits = static_cast*>(total_size_scalar.get())->value(); + auto const total_size_in_bits = + static_cast*>(total_size_scalar.get())->value(); // Convert the size in bits to the size in bytes. return static_cast(std::ceil(static_cast(total_size_in_bits) / 8)); } diff --git a/cpp/benchmarks/common/table_utilities.hpp b/cpp/benchmarks/common/table_utilities.hpp index ad7ac5dc451..04ee847d397 100644 --- a/cpp/benchmarks/common/table_utilities.hpp +++ b/cpp/benchmarks/common/table_utilities.hpp @@ -20,22 +20,22 @@ /** * @brief Estimates the column size in bytes. - * + * * @remark As this function internally uses cudf::row_bit_count() to estimate each row size * and accumulates them, the returned estimate may be an inexact approximation in some * cases. See cudf::row_bit_count() for more details. - * + * * @param view The column view to estimate its size */ int64_t estimate_size(cudf::column_view const& view); /** * @brief Estimates the table size in bytes. - * + * * @remark As this function internally uses cudf::row_bit_count() to estimate each row size * and accumulates them, the returned estimate may be an inexact approximation in some * cases. 
See cudf::row_bit_count() for more details. - * + * * @param view The table view to estimate its size */ int64_t estimate_size(cudf::table_view const& view); diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp index 01591739695..5095337dbb3 100644 --- a/cpp/benchmarks/reduction/dictionary.cpp +++ b/cpp/benchmarks/reduction/dictionary.cpp @@ -56,7 +56,7 @@ void BM_reduction_dictionary(benchmark::State& state, // The benchmark takes a column and produces two scalars. set_items_processed(state, column_size + 1); - + // We don't set the metrics for the size read/written as row_bit_count() doesn't // support the dictionary type yet (and so is estimate_size()). // See https://github.com/rapidsai/cudf/issues/16121 for details. diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index ee7f3ad0e66..de7c83aa198 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -15,8 +15,8 @@ */ #include -#include #include +#include #include #include @@ -42,16 +42,18 @@ static void nvbench_reduction_scan(nvbench::state& state, nvbench::type_listview().column(0)); int64_t result_size = 0; - state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - rmm::cuda_stream_view stream_view{launch.get_stream()}; - timer.start(); - auto result = cudf::detail::inclusive_dense_rank_scan( - input, stream_view, rmm::mr::get_current_device_resource()); - timer.stop(); + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + timer.start(); + auto result = cudf::detail::inclusive_dense_rank_scan( + input, stream_view, rmm::mr::get_current_device_resource()); + timer.stop(); - // Estimating the result size will launch a kernel. Do not include it in measuring time. 
- result_size += estimate_size(result->view()); - }); + // Estimating the result size will launch a kernel. Do not include it in measuring + // time. + result_size += estimate_size(result->view()); + }); state.add_element_count(input.size()); state.add_global_memory_reads(estimate_size(input)); diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp index 410843ab0f2..dba58bdf7f5 100644 --- a/cpp/benchmarks/reduction/scan_structs.cpp +++ b/cpp/benchmarks/reduction/scan_structs.cpp @@ -15,8 +15,8 @@ */ #include -#include #include +#include #include #include @@ -55,15 +55,17 @@ static void nvbench_structs_scan(nvbench::state& state) state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); int64_t result_size = 0; - state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { - timer.start(); - auto const result = cudf::detail::scan_inclusive( - input_view, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); - timer.stop(); + state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, + [&](nvbench::launch& launch, auto& timer) { + timer.start(); + auto const result = cudf::detail::scan_inclusive( + input_view, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); + timer.stop(); - // Estimating the result size will launch a kernel. Do not include it in measuring time. - result_size += estimate_size(result->view()); - }); + // Estimating the result size will launch a kernel. Do not include it in measuring + // time. 
+ result_size += estimate_size(result->view()); + }); state.add_element_count(input_view.size()); state.add_global_memory_reads(estimate_size(input_view)); From 6cb33b39c18e06ec23faf5669130aeebd26c448a Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 1 Jul 2024 15:21:34 -0700 Subject: [PATCH 6/7] more explicit result size computation --- cpp/benchmarks/reduction/rank.cpp | 21 +++++++-------------- cpp/benchmarks/reduction/scan_structs.cpp | 19 ++++++------------- 2 files changed, 13 insertions(+), 27 deletions(-) diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index de7c83aa198..14876c80d3e 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -41,23 +41,16 @@ static void nvbench_reduction_scan(nvbench::state& state, nvbench::type_listview(), 2); cudf::column_view input(new_tbl->view().column(0)); - int64_t result_size = 0; - state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { - rmm::cuda_stream_view stream_view{launch.get_stream()}; - timer.start(); - auto result = cudf::detail::inclusive_dense_rank_scan( - input, stream_view, rmm::mr::get_current_device_resource()); - timer.stop(); - - // Estimating the result size will launch a kernel. Do not include it in measuring - // time. 
- result_size += estimate_size(result->view()); - }); + std::unique_ptr result = nullptr; + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + rmm::cuda_stream_view stream_view{launch.get_stream()}; + result = cudf::detail::inclusive_dense_rank_scan( + input, stream_view, rmm::mr::get_current_device_resource()); + }); state.add_element_count(input.size()); state.add_global_memory_reads(estimate_size(input)); - state.add_global_memory_writes(result_size); + state.add_global_memory_writes(estimate_size(result->view())); set_throughputs(state); } diff --git a/cpp/benchmarks/reduction/scan_structs.cpp b/cpp/benchmarks/reduction/scan_structs.cpp index dba58bdf7f5..a781f75a314 100644 --- a/cpp/benchmarks/reduction/scan_structs.cpp +++ b/cpp/benchmarks/reduction/scan_structs.cpp @@ -54,22 +54,15 @@ static void nvbench_structs_scan(nvbench::state& state) auto const stream = cudf::get_default_stream(); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); - int64_t result_size = 0; - state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, - [&](nvbench::launch& launch, auto& timer) { - timer.start(); - auto const result = cudf::detail::scan_inclusive( - input_view, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); - timer.stop(); - - // Estimating the result size will launch a kernel. Do not include it in measuring - // time. 
- result_size += estimate_size(result->view()); - }); + std::unique_ptr result = nullptr; + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + result = cudf::detail::scan_inclusive( + input_view, *agg, null_policy, stream, rmm::mr::get_current_device_resource()); + }); state.add_element_count(input_view.size()); state.add_global_memory_reads(estimate_size(input_view)); - state.add_global_memory_writes(result_size); + state.add_global_memory_writes(estimate_size(result->view())); set_throughputs(state); } From 8d051b530f5ac693acb40f825aaa2e0ecab6baf6 Mon Sep 17 00:00:00 2001 From: Jihoon Son Date: Mon, 1 Jul 2024 16:27:18 -0700 Subject: [PATCH 7/7] fix another result size to not accumulate --- cpp/benchmarks/reduction/scan.cpp | 14 +++++--------- 1 file changed, 5 insertions(+), 9 deletions(-) diff --git a/cpp/benchmarks/reduction/scan.cpp b/cpp/benchmarks/reduction/scan.cpp index d7d47a8c9f1..dc05aad9807 100644 --- a/cpp/benchmarks/reduction/scan.cpp +++ b/cpp/benchmarks/reduction/scan.cpp @@ -36,20 +36,16 @@ static void BM_reduction_scan(benchmark::State& state, bool include_nulls) auto const column = create_random_column(dtype, row_count{n_rows}); if (!include_nulls) column->set_null_mask(rmm::device_buffer{}, 0); - int64_t result_size = 0; + std::unique_ptr result = nullptr; for (auto _ : state) { - std::unique_ptr result = nullptr; - { - cuda_event_timer timer(state, true); - result = cudf::scan( - *column, *cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); - } - result_size = estimate_size(result->view()); + cuda_event_timer timer(state, true); + result = cudf::scan( + *column, *cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); } // The benchmark takes a column and produces a new column of the same size as input. 
set_items_processed(state, n_rows * 2); - set_bytes_processed(state, estimate_size(column->view()) + result_size); + set_bytes_processed(state, estimate_size(column->view()) + estimate_size(result->view())); } #define SCAN_BENCHMARK_DEFINE(name, type, nulls) \