
Abstract Syntax Tree Cleanup and Tests #7418

Merged
Merged 22 commits into branch-0.20 from ast-tests-cleanup on May 11, 2021
Changes from 5 commits
Commits
22 commits
c2cec05
Large unit tests
codereport Feb 22, 2021
1eae304
Move AST plan initialization to ctor
codereport Feb 22, 2021
5af4c59
AST Plan docs + more cleanup
codereport Feb 22, 2021
6ce1add
Calculate offsets once
codereport Feb 23, 2021
ae896e5
Move more work to AST ctor, currently failing
codereport Feb 24, 2021
393cdb2
temp
codereport Mar 2, 2021
1697eb3
Merge branch 'branch-0.20' into ast-tests-cleanup
codereport Apr 8, 2021
33ce7cf
Merge remote-tracking branch 'origin/branch-0.20' into ast-tests-cleanup
vyasr Apr 17, 2021
c9315c5
Keep device buffer from going out of scope before compute.
vyasr Apr 17, 2021
314358b
Clean up comments.
vyasr Apr 19, 2021
9c004d0
Inline offset and host buffer creation.
vyasr Apr 19, 2021
2534773
Remove preprocessor elided code.
vyasr Apr 19, 2021
97e84f2
Merge remote-tracking branch 'origin/branch-0.20' into ast-tests-cleanup
vyasr Apr 21, 2021
f924671
Remove unnecessary forward declarations and friend relationships betw…
vyasr Apr 21, 2021
459eae1
Add more informative comment for why r-value expression constructors …
vyasr Apr 21, 2021
33bf98d
Move linearizer.hpp to nodes.hpp.
vyasr Apr 21, 2021
8781a3b
Make ast plan internals public and use them directly.
vyasr Apr 21, 2021
4ed23dd
Apply rename in meta.yaml.
vyasr Apr 21, 2021
8931591
Merge branch 'branch-0.20' into ast-tests-cleanup
vyasr May 3, 2021
f0a5c66
Change raw pointers in ast_plan to span.
vyasr May 3, 2021
d2a5e8b
Pass the spans through the code.
vyasr May 3, 2021
98b219a
Address PR comments.
vyasr May 6, 2021
100 changes: 84 additions & 16 deletions cpp/include/cudf/ast/detail/transform.cuh
@@ -15,6 +15,7 @@
*/
#pragma once

#include <cudf/ast/detail/linearizer.hpp>
#include <cudf/ast/detail/operators.hpp>
#include <cudf/ast/linearizer.hpp>
#include <cudf/ast/operators.hpp>
@@ -316,43 +317,110 @@ __device__ void evaluate_row_expression(detail::row_evaluator const& evaluator,
}
}

/**
* @brief The AST plan creates a device buffer of data needed to execute an AST.
*
* On construction, an AST plan creates a single "packed" host buffer of all necessary data arrays,
* and copies that to the device with a single host-device memory copy. Because the plan tends to be
* small, this is the most efficient approach for low latency.
*
* TODO: Remove comment below depending on final design
* The stream is not synchronized automatically, so a stream sync must be performed manually (or by
* another function) before the device data can be used safely.
*
*/
struct ast_plan {
public:
ast_plan() : sizes(), data_pointers() {}
ast_plan(linearizer const& expr_linearizer,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
: _sizes{}, _data_pointers{}
{
add_to_plan(expr_linearizer.data_references());
add_to_plan(expr_linearizer.literals());
add_to_plan(expr_linearizer.operators());
add_to_plan(expr_linearizer.operator_source_indices());

// Create device buffer
auto const h_data_buffer = host_data_buffer();
auto const buffer_offsets = offsets();
auto const buffer_size = h_data_buffer.second;
auto device_data_buffer =
rmm::device_buffer(h_data_buffer.first.get(), buffer_size, stream, mr);

// To reduce overhead, we don't call a stream sync here.
// The stream is synced later when the table_device_view is created.
// ^^^^ this comment will be removed, we are synchronizing vvvv
stream.synchronize(); // this doesn't seem to work

// Create device pointers to components of plan
auto const device_data_buffer_ptr = static_cast<const char*>(device_data_buffer.data());
_device_data_references = reinterpret_cast<const detail::device_data_reference*>(
device_data_buffer_ptr + buffer_offsets[0]);
_device_literals = reinterpret_cast<const cudf::detail::fixed_width_scalar_device_view_base*>(
device_data_buffer_ptr + buffer_offsets[1]);
_device_operators =
reinterpret_cast<const ast_operator*>(device_data_buffer_ptr + buffer_offsets[2]);
_device_operator_source_indices =
reinterpret_cast<const cudf::size_type*>(device_data_buffer_ptr + buffer_offsets[3]);
}

auto device_data_references() const { return _device_data_references; }
auto device_literals() const { return _device_literals; }
auto device_operators() const { return _device_operators; }
auto device_operator_source_indices() const { return _device_operator_source_indices; }

private:
using buffer_type = std::pair<std::unique_ptr<char[]>, int>;

/**
* @brief Helper function for adding components (operators, literals, etc) to AST plan
*
* @tparam T The underlying type of the input `std::vector`
* @param v The `std::vector` containing components (operators, literals, etc)
*/
template <typename T>
void add_to_plan(std::vector<T> const& v)
{
auto const data_size = sizeof(T) * v.size();
sizes.push_back(data_size);
data_pointers.push_back(v.data());
_sizes.push_back(data_size);
_data_pointers.push_back(v.data());
}

buffer_type get_host_data_buffer() const
/**
* @brief Create and return host buffer
*
* @return `std::pair` containing host buffer and buffer size
*/
buffer_type host_data_buffer() const
{
auto const total_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0);
auto const total_size = std::accumulate(_sizes.cbegin(), _sizes.cend(), 0);
auto host_data_buffer = std::make_unique<char[]>(total_size);
auto const offsets = get_offsets();
for (unsigned int i = 0; i < data_pointers.size(); ++i) {
std::memcpy(host_data_buffer.get() + offsets[i], data_pointers[i], sizes[i]);
}
auto const offset = offsets(); // calculate once outside for loop
for (unsigned int i = 0; i < _data_pointers.size(); ++i)
std::memcpy(host_data_buffer.get() + offset[i], _data_pointers[i], _sizes[i]);
return std::make_pair(std::move(host_data_buffer), total_size);
}

std::vector<cudf::size_type> get_offsets() const
/**
* @brief Returns the byte offset of each component within the packed host buffer
*
* @return `std::vector` of offsets, one per entry of `_data_pointers`
*/
std::vector<cudf::size_type> offsets() const
{
auto offsets = std::vector<int>(sizes.size());
auto offsets = std::vector<int>(_sizes.size());
// When C++17, use std::exclusive_scan
offsets[0] = 0;
std::partial_sum(sizes.cbegin(), sizes.cend() - 1, offsets.begin() + 1);
std::partial_sum(_sizes.cbegin(), _sizes.cend() - 1, offsets.begin() + 1);
return offsets;
}

private:
std::vector<cudf::size_type> sizes;
std::vector<const void*> data_pointers;
std::vector<cudf::size_type> _sizes;
std::vector<const void*> _data_pointers;
const detail::device_data_reference* _device_data_references;
const cudf::detail::fixed_width_scalar_device_view_base* _device_literals;
const ast_operator* _device_operators;
const cudf::size_type* _device_operator_source_indices;
};
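The ast_plan documentation above describes packing every component array into one host buffer and copying it to the device in a single transfer. The following is a minimal standalone sketch of that packing scheme using only the standard library; the class name packed_host_buffer and its members are invented for illustration and are not cudf types. Once C++17 is available, the offsets computation could use std::exclusive_scan directly, as the inline comment in offsets() anticipates.

// Standalone illustration of the packing scheme (not cudf code): arrays are appended to one
// contiguous host buffer, with per-array offsets taken from an exclusive scan of the sizes,
// so only a single host-to-device copy would be required.
#include <cstddef>
#include <cstring>
#include <memory>
#include <numeric>
#include <utility>
#include <vector>

class packed_host_buffer {
 public:
  // Record the size and location of one host array to be packed.
  template <typename T>
  void add_array(std::vector<T> const& v)
  {
    _sizes.push_back(sizeof(T) * v.size());
    _data_pointers.push_back(v.data());
  }

  // Exclusive scan of the sizes: offsets[i] is where array i starts in the packed buffer.
  std::vector<std::size_t> offsets() const
  {
    auto offsets = std::vector<std::size_t>(_sizes.size(), 0);
    if (_sizes.empty()) { return offsets; }
    std::partial_sum(_sizes.cbegin(), _sizes.cend() - 1, offsets.begin() + 1);
    return offsets;
  }

  // Build the packed buffer and its total size; the result would feed one device copy.
  std::pair<std::unique_ptr<char[]>, std::size_t> pack() const
  {
    auto const total_size = std::accumulate(_sizes.cbegin(), _sizes.cend(), std::size_t{0});
    auto buffer           = std::make_unique<char[]>(total_size);
    auto const offset     = offsets();
    for (std::size_t i = 0; i < _data_pointers.size(); ++i) {
      std::memcpy(buffer.get() + offset[i], _data_pointers[i], _sizes[i]);
    }
    return {std::move(buffer), total_size};
  }

 private:
  std::vector<std::size_t> _sizes;
  std::vector<void const*> _data_pointers;
};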

54 changes: 11 additions & 43 deletions cpp/src/ast/transform.cu
@@ -73,13 +73,12 @@ __launch_bounds__(max_block_size) __global__
{
extern __shared__ std::int64_t intermediate_storage[];
auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * num_intermediates];
auto const start_idx = cudf::size_type(threadIdx.x + blockIdx.x * blockDim.x);
auto const stride = cudf::size_type(blockDim.x * gridDim.x);
auto const num_rows = table.num_rows();
auto const start_idx = static_cast<cudf::size_type>(threadIdx.x + blockIdx.x * blockDim.x);
auto const stride = static_cast<cudf::size_type>(blockDim.x * gridDim.x);
auto const evaluator =
cudf::ast::detail::row_evaluator(table, literals, thread_intermediate_storage, &output_column);

for (cudf::size_type row_index = start_idx; row_index < num_rows; row_index += stride) {
for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) {
evaluate_row_expression(
evaluator, data_references, operators, operator_source_indices, num_operators, row_index);
}
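The kernel body above walks rows with a grid-stride loop, so correctness does not depend on launching exactly one thread per row. A minimal self-contained CUDA sketch of the same pattern, with an invented kernel (add_one_kernel) rather than cudf's compute_column_kernel:

// Grid-stride loop sketch (illustration only): each thread starts at its global index and
// advances by the total thread count of the grid, so every row is covered for any grid size.
__global__ void add_one_kernel(int const* in, int* out, int num_rows)
{
  auto const start_idx = static_cast<int>(threadIdx.x + blockIdx.x * blockDim.x);
  auto const stride    = static_cast<int>(blockDim.x * gridDim.x);
  for (int row = start_idx; row < num_rows; row += stride) { out[row] = in[row] + 1; }
}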
@@ -90,48 +89,16 @@ std::unique_ptr<column> compute_column(table_view const table,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
// Linearize the AST
auto const expr_linearizer = linearizer(expr, table);
auto const data_references = expr_linearizer.data_references();
auto const literals = expr_linearizer.literals();
auto const operators = expr_linearizer.operators();
auto const num_operators = cudf::size_type(operators.size());
auto const operator_source_indices = expr_linearizer.operator_source_indices();
auto const expr_data_type = expr_linearizer.root_data_type();

// Create ast_plan and device buffer
auto plan = ast_plan();
plan.add_to_plan(data_references);
plan.add_to_plan(literals);
plan.add_to_plan(operators);
plan.add_to_plan(operator_source_indices);
auto const host_data_buffer = plan.get_host_data_buffer();
auto const buffer_offsets = plan.get_offsets();
auto const buffer_size = host_data_buffer.second;
auto device_data_buffer =
rmm::device_buffer(host_data_buffer.first.get(), buffer_size, stream, mr);
// To reduce overhead, we don't call a stream sync here.
// The stream is synced later when the table_device_view is created.

// Create device pointers to components of plan
auto const device_data_buffer_ptr = static_cast<const char*>(device_data_buffer.data());
auto const device_data_references = reinterpret_cast<const detail::device_data_reference*>(
device_data_buffer_ptr + buffer_offsets[0]);
auto const device_literals =
reinterpret_cast<const cudf::detail::fixed_width_scalar_device_view_base*>(
device_data_buffer_ptr + buffer_offsets[1]);
auto const device_operators =
reinterpret_cast<const ast_operator*>(device_data_buffer_ptr + buffer_offsets[2]);
auto const device_operator_source_indices =
reinterpret_cast<const cudf::size_type*>(device_data_buffer_ptr + buffer_offsets[3]);
auto const expr_linearizer = linearizer(expr, table); // Linearize the AST
auto const plan = ast_plan{expr_linearizer, stream, mr}; // Create ast_plan

// Create table device view
auto table_device = table_device_view::create(table, stream);
auto const table_num_rows = table.num_rows();

// Prepare output column
auto output_column = cudf::make_fixed_width_column(
expr_data_type, table_num_rows, mask_state::UNALLOCATED, stream, mr);
expr_linearizer.root_data_type(), table_num_rows, mask_state::UNALLOCATED, stream, mr);
auto mutable_output_device =
cudf::mutable_column_device_view::create(output_column->mutable_view(), stream);

@@ -150,16 +117,17 @@ std::unique_ptr<column> compute_column(table_view const table,
: MAX_BLOCK_SIZE;
auto const config = cudf::detail::grid_1d{table_num_rows, block_size};
auto const shmem_size_per_block = shmem_size_per_thread * config.num_threads_per_block;
auto const num_operators = static_cast<cudf::size_type>(expr_linearizer.operators().size());

// Execute the kernel
cudf::ast::detail::compute_column_kernel<MAX_BLOCK_SIZE>
<<<config.num_blocks, config.num_threads_per_block, shmem_size_per_block, stream.value()>>>(
*table_device,
device_literals,
plan.device_literals(),
*mutable_output_device,
device_data_references,
device_operators,
device_operator_source_indices,
plan.device_data_references(),
plan.device_operators(),
plan.device_operator_source_indices(),
num_operators,
num_intermediates);
CHECK_CUDA(stream.value());
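The launch configuration above sizes shared memory as shmem_size_per_thread bytes per thread and multiplies by num_threads_per_block for the per-block request, capping the block size at MAX_BLOCK_SIZE. A standalone host-side sketch of that arithmetic; the 128-thread cap and the 48 KiB limit in the example are assumptions for illustration, not values taken from this diff:

#include <algorithm>
#include <cstddef>
#include <cstdint>
#include <cstdio>
#include <utility>

// Illustrative launch-configuration arithmetic (not cudf's exact code).
constexpr int MAX_BLOCK_SIZE = 128;  // assumed default cap on threads per block

// Returns {block_size, shared_memory_bytes_per_block}.
std::pair<int, std::size_t> launch_config(int num_intermediates, std::size_t shmem_limit_per_block)
{
  // One std::int64_t of shared memory per intermediate value, per thread.
  auto const shmem_per_thread = sizeof(std::int64_t) * static_cast<std::size_t>(num_intermediates);
  int block_size              = MAX_BLOCK_SIZE;
  if (shmem_per_thread > 0) {
    // Shrink the block so the per-block request fits the limit (at least one thread).
    auto const max_threads = static_cast<int>(shmem_limit_per_block / shmem_per_thread);
    block_size             = std::min(MAX_BLOCK_SIZE, std::max(1, max_threads));
  }
  return {block_size, shmem_per_thread * static_cast<std::size_t>(block_size)};
}

int main()
{
  auto const cfg = launch_config(4, 48 * 1024);  // e.g. 4 intermediates, 48 KiB per block
  std::printf("block_size=%d shmem_per_block=%zu\n", cfg.first, cfg.second);
  return 0;
}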
66 changes: 66 additions & 0 deletions cpp/tests/ast/transform_tests.cpp
@@ -18,6 +18,7 @@
#include <cudf/ast/transform.hpp>
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/detail/iterator.cuh>
#include <cudf/scalar/scalar.hpp>
#include <cudf/scalar/scalar_device_view.cuh>
#include <cudf/scalar/scalar_factories.hpp>
@@ -30,6 +31,8 @@
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/table_utilities.hpp>

#include <thrust/iterator/counting_iterator.h>

#include <limits>
#include <type_traits>

@@ -55,6 +58,22 @@ TEST_F(TransformTest, BasicAddition)
cudf::test::expect_columns_equal(expected, result->view(), true);
}

TEST_F(TransformTest, BasicAdditionLarge)
{
auto a = thrust::make_counting_iterator(0);
auto col = column_wrapper<int32_t>(a, a + 2000);
auto table = cudf::table_view{{col, col}};

auto col_ref = cudf::ast::column_reference(0);
auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref, col_ref);

auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; });
auto expected = column_wrapper<int32_t>(b, b + 2000);
auto result = cudf::ast::compute_column(table, expression);

cudf::test::expect_columns_equal(expected, result->view(), true);
}

TEST_F(TransformTest, LessComparator)
{
auto c_0 = column_wrapper<int32_t>{3, 20, 1, 50};
@@ -71,6 +90,25 @@ TEST_F(TransformTest, LessComparator)
cudf::test::expect_columns_equal(expected, result->view(), true);
}

TEST_F(TransformTest, LessComparatorLarge)
{
auto a = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; });
auto b = thrust::make_counting_iterator(500);
auto c_0 = column_wrapper<int32_t>(a, a + 2000);
auto c_1 = column_wrapper<int32_t>(b, b + 2000);
auto table = cudf::table_view{{c_0, c_1}};

auto col_ref_0 = cudf::ast::column_reference(0);
auto col_ref_1 = cudf::ast::column_reference(1);
auto expression = cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1);

auto c = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 500; });
auto expected = column_wrapper<bool>(c, c + 2000);
auto result = cudf::ast::compute_column(table, expression);

cudf::test::expect_columns_equal(expected, result->view(), true);
}

TEST_F(TransformTest, MultiLevelTreeArithmetic)
{
auto c_0 = column_wrapper<int32_t>{3, 20, 1, 50};
@@ -97,6 +135,34 @@ TEST_F(TransformTest, MultiLevelTreeArithmetic)
cudf::test::expect_columns_equal(expected, result->view(), true);
}

TEST_F(TransformTest, MultiLevelTreeArithmeticLarge)
{
using namespace cudf::ast;

auto a = thrust::make_counting_iterator(0);
auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i + 1; });
auto c = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; });
auto c_0 = column_wrapper<int32_t>(a, a + 2000);
auto c_1 = column_wrapper<int32_t>(b, b + 2000);
auto c_2 = column_wrapper<int32_t>(c, c + 2000);
auto table = cudf::table_view{{c_0, c_1, c_2}};

auto col_ref_0 = column_reference(0);
auto col_ref_1 = column_reference(1);
auto col_ref_2 = column_reference(2);

auto expr_left_subtree = expression(cudf::ast::ast_operator::MUL, col_ref_0, col_ref_1);
auto expr_right_subtree = expression(cudf::ast::ast_operator::ADD, col_ref_2, col_ref_0);
auto expr_tree = expression(ast_operator::SUB, expr_left_subtree, expr_right_subtree);

auto result = cudf::ast::compute_column(table, expr_tree);
auto calc = [](auto i) { return (i * (i + 1)) - (i + (i * 2)); };
auto d = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return calc(i); });
auto expected = column_wrapper<int32_t>(d, d + 2000);

cudf::test::expect_columns_equal(expected, result->view(), true);
}
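The expected values in MultiLevelTreeArithmeticLarge follow from the column generators: column 0 holds i, column 1 holds i + 1, and column 2 holds 2 * i, so the tree (col0 * col1) - (col2 + col0) evaluates to i * (i + 1) - 3 * i = i * i - 2 * i per row, which is what the calc lambda encodes. A tiny standalone check of that identity (not part of the test file):

// Verifies that the per-row AST result matches the closed form i*i - 2*i for the
// 2000 row indices used by the test above. Returns 0 when the identity holds.
int main()
{
  for (int i = 0; i < 2000; ++i) {
    auto const tree_value = (i * (i + 1)) - ((i * 2) + i);  // (col0 * col1) - (col2 + col0)
    if (tree_value != i * i - 2 * i) { return 1; }
  }
  return 0;
}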

TEST_F(TransformTest, ImbalancedTreeArithmetic)
{
auto c_0 = column_wrapper<double>{0.15, 0.37, 4.2, 21.3};