
Abstract Syntax Tree Cleanup and Tests #7418

Merged
Merged 22 commits into branch-0.20 from ast-tests-cleanup on May 11, 2021
Changes from 5 commits
Commits
22 commits
c2cec05
Large unit tests
codereport Feb 22, 2021
1eae304
Move AST plan initialization to ctor
codereport Feb 22, 2021
5af4c59
AST Plan docs + more cleanup
codereport Feb 22, 2021
6ce1add
Calculate offsets once
codereport Feb 23, 2021
ae896e5
Move more work to AST ctor, currently failing
codereport Feb 24, 2021
393cdb2
temp
codereport Mar 2, 2021
1697eb3
Merge branch 'branch-0.20' into ast-tests-cleanup
codereport Apr 8, 2021
33ce7cf
Merge remote-tracking branch 'origin/branch-0.20' into ast-tests-cleanup
vyasr Apr 17, 2021
c9315c5
Keep device buffer from going out of scope before compute.
vyasr Apr 17, 2021
314358b
Clean up comments.
vyasr Apr 19, 2021
9c004d0
Inline offset and host buffer creation.
vyasr Apr 19, 2021
2534773
Remove preprocessor elided code.
vyasr Apr 19, 2021
97e84f2
Merge remote-tracking branch 'origin/branch-0.20' into ast-tests-cleanup
vyasr Apr 21, 2021
f924671
Remove unnecessary forward declarations and friend relationships betw…
vyasr Apr 21, 2021
459eae1
Add more informative comment for why r-value expression constructors …
vyasr Apr 21, 2021
33bf98d
Move linearizer.hpp to nodes.hpp.
vyasr Apr 21, 2021
8781a3b
Make ast plan internals public and use them directly.
vyasr Apr 21, 2021
4ed23dd
Apply rename in meta.yaml.
vyasr Apr 21, 2021
8931591
Merge branch 'branch-0.20' into ast-tests-cleanup
vyasr May 3, 2021
f0a5c66
Change raw pointers in ast_plan to span.
vyasr May 3, 2021
d2a5e8b
Pass the spans through the code.
vyasr May 3, 2021
98b219a
Address PR comments.
vyasr May 6, 2021
100 changes: 84 additions & 16 deletions cpp/include/cudf/ast/detail/transform.cuh
@@ -15,6 +15,7 @@
*/
#pragma once

#include <cudf/ast/detail/linearizer.hpp>
#include <cudf/ast/detail/operators.hpp>
#include <cudf/ast/linearizer.hpp>
#include <cudf/ast/operators.hpp>
@@ -316,43 +317,110 @@ __device__ void evaluate_row_expression(detail::row_evaluator const& evaluator,
}
}

/**
* @brief The AST plan creates a device buffer of data needed to execute an AST.
*
* On construction, an AST plan creates a single "packed" host buffer of all necessary data arrays,
* and copies that to the device with a single host-device memory copy. Because the plan tends to be
* small, this is the most efficient approach for low latency.
*
* TODO: Remove comment below depending on final design
* The stream is not synchronized automatically, so a stream sync must be performed manually (or by
* another function) before the device data can be used safely.
*
*/
struct ast_plan {
public:
ast_plan() : sizes(), data_pointers() {}
ast_plan(linearizer const& expr_linearizer,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
: _sizes{}, _data_pointers{}
{
add_to_plan(expr_linearizer.data_references());
add_to_plan(expr_linearizer.literals());
add_to_plan(expr_linearizer.operators());
add_to_plan(expr_linearizer.operator_source_indices());

// Create device buffer
auto const h_data_buffer = host_data_buffer();
auto const buffer_offsets = offsets();
auto const buffer_size = h_data_buffer.second;
auto device_data_buffer =
rmm::device_buffer(h_data_buffer.first.get(), buffer_size, stream, mr);

// To reduce overhead, we don't call a stream sync here.
// The stream is synced later when the table_device_view is created.
// ^^^^ this comment will be removed, we are synchronizing vvvv
stream.synchronize(); // this doesn't seem to work

// Create device pointers to components of plan
auto const device_data_buffer_ptr = static_cast<const char*>(device_data_buffer.data());
_device_data_references = reinterpret_cast<const detail::device_data_reference*>(
device_data_buffer_ptr + buffer_offsets[0]);
_device_literals = reinterpret_cast<const cudf::detail::fixed_width_scalar_device_view_base*>(
device_data_buffer_ptr + buffer_offsets[1]);
_device_operators =
reinterpret_cast<const ast_operator*>(device_data_buffer_ptr + buffer_offsets[2]);
_device_operator_source_indices =
reinterpret_cast<const cudf::size_type*>(device_data_buffer_ptr + buffer_offsets[3]);
}

auto device_data_references() const { return _device_data_references; }
auto device_literals() const { return _device_literals; }
auto device_operators() const { return _device_operators; }
auto device_operator_source_indices() const { return _device_operator_source_indices; }

private:
using buffer_type = std::pair<std::unique_ptr<char[]>, int>;

/**
* @brief Helper function for adding components (operators, literals, etc) to AST plan
*
* @tparam T The underlying type of the input `std::vector`
* @param v The `std::vector` containing components (operators, literals, etc)
*/
template <typename T>
void add_to_plan(std::vector<T> const& v)
{
auto const data_size = sizeof(T) * v.size();
sizes.push_back(data_size);
data_pointers.push_back(v.data());
_sizes.push_back(data_size);
_data_pointers.push_back(v.data());
}

buffer_type get_host_data_buffer() const
/**
* @brief Create and return host buffer
*
* @return `std::pair` containing host buffer and buffer size
*/
buffer_type host_data_buffer() const
{
auto const total_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0);
auto const total_size = std::accumulate(_sizes.cbegin(), _sizes.cend(), 0);
auto host_data_buffer = std::make_unique<char[]>(total_size);
auto const offsets = get_offsets();
for (unsigned int i = 0; i < data_pointers.size(); ++i) {
std::memcpy(host_data_buffer.get() + offsets[i], data_pointers[i], sizes[i]);
}
auto const offset = offsets(); // calculate once outside for loop
for (unsigned int i = 0; i < _data_pointers.size(); ++i)
std::memcpy(host_data_buffer.get() + offset[i], _data_pointers[i], _sizes[i]);
return std::make_pair(std::move(host_data_buffer), total_size);
}

std::vector<cudf::size_type> get_offsets() const
/**
* @brief Returns the byte offset of each component within the packed host buffer
*
* @return `std::vector` of offsets, one per entry of `_data_pointers`
*/
std::vector<cudf::size_type> offsets() const
{
auto offsets = std::vector<int>(sizes.size());
auto offsets = std::vector<int>(_sizes.size());
// When C++17, use std::exclusive_scan
offsets[0] = 0;
std::partial_sum(sizes.cbegin(), sizes.cend() - 1, offsets.begin() + 1);
std::partial_sum(_sizes.cbegin(), _sizes.cend() - 1, offsets.begin() + 1);
return offsets;
}

private:
std::vector<cudf::size_type> sizes;
std::vector<const void*> data_pointers;
std::vector<cudf::size_type> _sizes;
std::vector<const void*> _data_pointers;
const detail::device_data_reference* _device_data_references;
const cudf::detail::fixed_width_scalar_device_view_base* _device_literals;
const ast_operator* _device_operators;
const cudf::size_type* _device_operator_source_indices;
};
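The ast_plan documentation above describes packing every component array into one host buffer and copying it to the device in a single transfer. The following is a minimal standalone sketch of that packing scheme using only the standard library; the class name packed_host_buffer and its members are invented for illustration and are not cudf types. Once C++17 is available, the offsets computation could use std::exclusive_scan directly, as the inline comment in offsets() anticipates.

// Standalone illustration of the packing scheme (not cudf code): arrays are appended to one
// contiguous host buffer, with per-array offsets taken from an exclusive scan of the sizes,
// so only a single host-to-device copy would be required.
#include <cstddef>
#include <cstring>
#include <memory>
#include <numeric>
#include <utility>
#include <vector>

class packed_host_buffer {
 public:
  // Record the size and location of one host array to be packed.
  template <typename T>
  void add_array(std::vector<T> const& v)
  {
    _sizes.push_back(sizeof(T) * v.size());
    _data_pointers.push_back(v.data());
  }

  // Exclusive scan of the sizes: offsets[i] is where array i starts in the packed buffer.
  std::vector<std::size_t> offsets() const
  {
    auto offsets = std::vector<std::size_t>(_sizes.size(), 0);
    if (_sizes.empty()) { return offsets; }
    std::partial_sum(_sizes.cbegin(), _sizes.cend() - 1, offsets.begin() + 1);
    return offsets;
  }

  // Build the packed buffer and its total size; the result would feed one device copy.
  std::pair<std::unique_ptr<char[]>, std::size_t> pack() const
  {
    auto const total_size = std::accumulate(_sizes.cbegin(), _sizes.cend(), std::size_t{0});
    auto buffer           = std::make_unique<char[]>(total_size);
    auto const offset     = offsets();
    for (std::size_t i = 0; i < _data_pointers.size(); ++i) {
      std::memcpy(buffer.get() + offset[i], _data_pointers[i], _sizes[i]);
    }
    return {std::move(buffer), total_size};
  }

 private:
  std::vector<std::size_t> _sizes;
  std::vector<void const*> _data_pointers;
};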

54 changes: 11 additions & 43 deletions cpp/src/ast/transform.cu
@@ -73,13 +73,12 @@ __launch_bounds__(max_block_size) __global__
{
extern __shared__ std::int64_t intermediate_storage[];
auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * num_intermediates];
auto const start_idx = cudf::size_type(threadIdx.x + blockIdx.x * blockDim.x);
auto const stride = cudf::size_type(blockDim.x * gridDim.x);
auto const num_rows = table.num_rows();
auto const start_idx = static_cast<cudf::size_type>(threadIdx.x + blockIdx.x * blockDim.x);
auto const stride = static_cast<cudf::size_type>(blockDim.x * gridDim.x);
auto const evaluator =
cudf::ast::detail::row_evaluator(table, literals, thread_intermediate_storage, &output_column);

for (cudf::size_type row_index = start_idx; row_index < num_rows; row_index += stride) {
for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
evaluate_row_expression(
evaluator, data_references, operators, operator_source_indices, num_operators, row_index);
}
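The kernel body above walks rows with a grid-stride loop, so correctness does not depend on launching exactly one thread per row. A minimal self-contained CUDA sketch of the same pattern, with an invented kernel (add_one_kernel) rather than cudf's compute_column_kernel:

// Grid-stride loop sketch (illustration only): each thread starts at its global index and
// advances by the total thread count of the grid, so every row is covered for any grid size.
__global__ void add_one_kernel(int const* in, int* out, int num_rows)
{
  auto const start_idx = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
  auto const stride    = static_cast<int>(blockDim.x * gridDim.x);
  for (int row = start_idx; row < num_rows; row += stride) { out[row] = in[row] + 1; }
}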
@@ -90,48 +89,16 @@ std::unique_ptr<column> compute_column(table_view const table,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
// Linearize the AST
auto const expr_linearizer = linearizer(expr, table);
auto const data_references = expr_linearizer.data_references();
auto const literals = expr_linearizer.literals();
auto const operators = expr_linearizer.operators();
auto const num_operators = cudf::size_type(operators.size());
auto const operator_source_indices = expr_linearizer.operator_source_indices();
auto const expr_data_type = expr_linearizer.root_data_type();

// Create ast_plan and device buffer
auto plan = ast_plan();
plan.add_to_plan(data_references);
plan.add_to_plan(literals);
plan.add_to_plan(operators);
plan.add_to_plan(operator_source_indices);
auto const host_data_buffer = plan.get_host_data_buffer();
auto const buffer_offsets = plan.get_offsets();
auto const buffer_size = host_data_buffer.second;
auto device_data_buffer =
rmm::device_buffer(host_data_buffer.first.get(), buffer_size, stream, mr);
// To reduce overhead, we don't call a stream sync here.
// The stream is synced later when the table_device_view is created.

// Create device pointers to components of plan
auto const device_data_buffer_ptr = static_cast<const char*>(device_data_buffer.data());
auto const device_data_references = reinterpret_cast<const detail::device_data_reference*>(
device_data_buffer_ptr + buffer_offsets[0]);
auto const device_literals =
reinterpret_cast<const cudf::detail::fixed_width_scalar_device_view_base*>(
device_data_buffer_ptr + buffer_offsets[1]);
auto const device_operators =
reinterpret_cast<const ast_operator*>(device_data_buffer_ptr + buffer_offsets[2]);
auto const device_operator_source_indices =
reinterpret_cast<const cudf::size_type*>(device_data_buffer_ptr + buffer_offsets[3]);
auto const expr_linearizer = linearizer(expr, table); // Linearize the AST
auto const plan = ast_plan{expr_linearizer, stream, mr}; // Create ast_plan

// Create table device view
auto table_device = table_device_view::create(table, stream);
auto const table_num_rows = table.num_rows();

// Prepare output column
auto output_column = cudf::make_fixed_width_column(
expr_data_type, table_num_rows, mask_state::UNALLOCATED, stream, mr);
expr_linearizer.root_data_type(), table_num_rows, mask_state::UNALLOCATED, stream, mr);
auto mutable_output_device =
cudf::mutable_column_device_view::create(output_column->mutable_view(), stream);

@@ -150,16 +117,17 @@ std::unique_ptr<column> compute_column(table_view const table,
: MAX_BLOCK_SIZE;
auto const config = cudf::detail::grid_1d{table_num_rows, block_size};
auto const shmem_size_per_block = shmem_size_per_thread * config.num_threads_per_block;
auto const num_operators = static_cast<cudf::size_type>(expr_linearizer.operators().size());

// Execute the kernel
cudf::ast::detail::compute_column_kernel<MAX_BLOCK_SIZE>
<<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
*table_device,
device_literals,
plan.device_literals(),
*mutable_output_device,
device_data_references,
device_operators,
device_operator_source_indices,
plan.device_data_references(),
plan.device_operators(),
plan.device_operator_source_indices(),
num_operators,
num_intermediates);
CHECK_CUDA(stream.value());
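The launch configuration above sizes shared memory as shmem_size_per_thread bytes per thread and multiplies by num_threads_per_block for the per-block request, capping the block size at MAX_BLOCK_SIZE. A standalone host-side sketch of that arithmetic; the 128-thread cap and the 48 KiB limit in the example are assumptions for illustration, not values taken from this diff:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <utility>

// Illustrative launch-configuration arithmetic (not cudf's exact code).
constexpr int MAX_BLOCK_SIZE = 128;  // assumed default cap on threads per block

// Returns {block_size, shared_memory_bytes_per_block}.
std::pair<int, std::size_t> launch_config(int num_intermediates, std::size_t shmem_limit_per_block)
{
  // One std::int64_t of shared memory per intermediate value, per thread.
  auto const shmem_per_thread = sizeof(std::int64_t) * static_cast<std::size_t>(num_intermediates);
  int block_size              = MAX_BLOCK_SIZE;
  if (shmem_per_thread > 0) {
    // Shrink the block so the per-block request fits the limit (at least one thread).
    auto const max_threads = static_cast<int>(shmem_limit_per_block / shmem_per_thread);
    block_size             = std::min(MAX_BLOCK_SIZE, std::max(1, max_threads));
  }
  return {block_size, shmem_per_thread * static_cast<std::size_t>(block_size)};
}

int main()
{
  auto const cfg = launch_config(4, 48 * 1024);  // e.g. 4 intermediates, 48 KiB per block
  std::printf("block_size=%d shmem_per_block=%zu\n", cfg.first, cfg.second);
  return 0;
}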
66 changes: 66 additions & 0 deletions cpp/tests/ast/transform_tests.cpp
@@ -18,6 +18,7 @@
#include <cudf/ast/transform.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/scalar/scalar.hpp>
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/scalar/scalar_factories.hpp>
@@ -30,6 +31,8 @@
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/table_utilities.hpp>

#include <thrust/iterator/counting_iterator.h>

#include <limits>
#include <type_traits>

@@ -55,6 +58,22 @@ TEST_F(TransformTest, BasicAddition)
cudf::test::expect_columns_equal(expected, result->view(), true);
}

TEST_F(TransformTest, BasicAdditionLarge)
{
auto a = thrust::make_counting_iterator(0);
auto col = column_wrapper<int32_t>(a, a + 2000);
auto table = cudf::table_view{{col, col}};

auto col_ref = cudf::ast::column_reference(0);
auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref, col_ref);

auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; });
auto expected = column_wrapper<int32_t>(b, b + 2000);
auto result = cudf::ast::compute_column(table, expression);

cudf::test::expect_columns_equal(expected, result->view(), true);
}

TEST_F(TransformTest, LessComparator)
{
auto c_0 = column_wrapper<int32_t>{3, 20, 1, 50};
@@ -71,6 +90,25 @@ TEST_F(TransformTest, LessComparator)
cudf::test::expect_columns_equal(expected, result->view(), true);
}

TEST_F(TransformTest, LessComparatorLarge)
{
auto a = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; });
auto b = thrust::make_counting_iterator(500);
auto c_0 = column_wrapper<int32_t>(a, a + 2000);
auto c_1 = column_wrapper<int32_t>(b, b + 2000);
auto table = cudf::table_view{{c_0, c_1}};

auto col_ref_0 = cudf::ast::column_reference(0);
auto col_ref_1 = cudf::ast::column_reference(1);
auto expression = cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1);

auto c = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 500; });
auto expected = column_wrapper<bool>(c, c + 2000);
auto result = cudf::ast::compute_column(table, expression);

cudf::test::expect_columns_equal(expected, result->view(), true);
}

TEST_F(TransformTest, MultiLevelTreeArithmetic)
{
auto c_0 = column_wrapper<int32_t>{3, 20, 1, 50};
@@ -97,6 +135,34 @@ TEST_F(TransformTest, MultiLevelTreeArithmetic)
cudf::test::expect_columns_equal(expected, result->view(), true);
}

TEST_F(TransformTest, MultiLevelTreeArithmeticLarge)
{
using namespace cudf::ast;

auto a = thrust::make_counting_iterator(0);
auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i + 1; });
auto c = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; });
auto c_0 = column_wrapper<int32_t>(a, a + 2000);
auto c_1 = column_wrapper<int32_t>(b, b + 2000);
auto c_2 = column_wrapper<int32_t>(c, c + 2000);
auto table = cudf::table_view{{c_0, c_1, c_2}};

auto col_ref_0 = column_reference(0);
auto col_ref_1 = column_reference(1);
auto col_ref_2 = column_reference(2);

auto expr_left_subtree = expression(cudf::ast::ast_operator::MUL, col_ref_0, col_ref_1);
auto expr_right_subtree = expression(cudf::ast::ast_operator::ADD, col_ref_2, col_ref_0);
auto expr_tree = expression(ast_operator::SUB, expr_left_subtree, expr_right_subtree);

auto result = cudf::ast::compute_column(table, expr_tree);
auto calc = [](auto i) { return (i * (i + 1)) - (i + (i * 2)); };
auto d = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return calc(i); });
auto expected = column_wrapper<int32_t>(d, d + 2000);

cudf::test::expect_columns_equal(expected, result->view(), true);
}
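The expected values in MultiLevelTreeArithmeticLarge follow from the column generators: column 0 holds i, column 1 holds i + 1, and column 2 holds 2 * i, so the tree (col0 * col1) - (col2 + col0) evaluates to i * (i + 1) - 3 * i = i * i - 2 * i per row, which is what the calc lambda encodes. A tiny standalone check of that identity (not part of the test file):

// Verifies that the per-row AST result matches the closed form i*i - 2*i for the
// 2000 row indices used by the test above. Returns 0 when the identity holds.
int main()
{
  for (int i = 0; i < 2000; ++i) {
    auto const tree_value = (i * (i + 1)) - ((i * 2) + i);  // (col0 * col1) - (col2 + col0)
    if (tree_value != i * i - 2 * i) { return 1; }
  }
  return 0;
}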

TEST_F(TransformTest, ImbalancedTreeArithmetic)
{
auto c_0 = column_wrapper<double>{0.15, 0.37, 4.2, 21.3};