diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 4386a62e32a..a46b281a7dd 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -55,7 +55,7 @@ test: - test -f $PREFIX/include/cudf/ast/transform.hpp - test -f $PREFIX/include/cudf/ast/detail/linearizer.hpp - test -f $PREFIX/include/cudf/ast/detail/operators.hpp - - test -f $PREFIX/include/cudf/ast/linearizer.hpp + - test -f $PREFIX/include/cudf/ast/nodes.hpp - test -f $PREFIX/include/cudf/ast/operators.hpp - test -f $PREFIX/include/cudf/binaryop.hpp - test -f $PREFIX/include/cudf/labeling/label_bins.hpp diff --git a/cpp/include/cudf/ast/detail/linearizer.hpp b/cpp/include/cudf/ast/detail/linearizer.hpp index 68319a24e5d..166a0408703 100644 --- a/cpp/include/cudf/ast/detail/linearizer.hpp +++ b/cpp/include/cudf/ast/detail/linearizer.hpp @@ -83,10 +83,7 @@ class linearizer; * This class is a part of a "visitor" pattern with the `linearizer` class. * Nodes inheriting from this class can accept visitors. */ -class node { - friend class detail::linearizer; - - private: +struct node { virtual cudf::size_type accept(detail::linearizer& visitor) const = 0; }; @@ -102,10 +99,6 @@ class node { * resolved into intermediate data storage in shared memory. */ class linearizer { - friend class literal; - friend class column_reference; - friend class expression; - public: /** * @brief Construct a new linearizer object diff --git a/cpp/include/cudf/ast/detail/operators.hpp b/cpp/include/cudf/ast/detail/operators.hpp index 27bcb0d320b..8ae60f96997 100644 --- a/cpp/include/cudf/ast/detail/operators.hpp +++ b/cpp/include/cudf/ast/detail/operators.hpp @@ -753,43 +753,6 @@ struct operator_functor { } }; -#if 0 -/** - * @brief Functor used to double-type-dispatch binary operators. - * - * This functor's `operator()` is templated to validate calls to its operators based on the input - * type, as determined by the `is_valid_binary_op` trait. - * - * @tparam OperatorFunctor Binary operator functor. - */ -template -struct double_dispatch_binary_operator_types { - template >* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(F&& f, Ts&&... args) - { - f.template operator()(std::forward(args)...); - } - - template >* = nullptr> - CUDA_HOST_DEVICE_CALLABLE void operator()(F&& f, Ts&&... args) - { -#ifndef __CUDA_ARCH__ - CUDF_FAIL("Invalid binary operation."); -#else - cudf_assert(false && "Invalid binary operation."); -#endif - } -}; -#endif - /** * @brief Functor used to single-type-dispatch binary operators. * @@ -856,16 +819,6 @@ struct type_dispatch_binary_op { F&& f, Ts&&... args) { -#if 0 - // Double dispatch - /* - double_type_dispatcher(lhs_type, - rhs_type, - detail::double_dispatch_binary_operator_types>{}, - std::forward(f), - std::forward(args)...); - */ -#endif // Single dispatch (assume lhs_type == rhs_type) type_dispatcher(lhs_type, detail::single_dispatch_binary_operator_types>{}, diff --git a/cpp/include/cudf/ast/detail/transform.cuh b/cpp/include/cudf/ast/detail/transform.cuh index f9d7426e2e4..f69927a3601 100644 --- a/cpp/include/cudf/ast/detail/transform.cuh +++ b/cpp/include/cudf/ast/detail/transform.cuh @@ -15,8 +15,9 @@ */ #pragma once +#include #include -#include +#include #include #include #include @@ -25,6 +26,7 @@ #include #include #include +#include #include #include @@ -155,10 +157,11 @@ struct row_evaluator { * storing intermediates. * @param output_column The output column where results are stored. */ - __device__ row_evaluator(table_device_view const& table, - const cudf::detail::fixed_width_scalar_device_view_base* literals, - std::int64_t* thread_intermediate_storage, - mutable_column_device_view* output_column) + __device__ row_evaluator( + table_device_view const& table, + device_span literals, + std::int64_t* thread_intermediate_storage, + mutable_column_device_view* output_column) : table(table), literals(literals), thread_intermediate_storage(thread_intermediate_storage), @@ -264,7 +267,7 @@ struct row_evaluator { private: table_device_view const& table; - const cudf::detail::fixed_width_scalar_device_view_base* literals; + device_span literals; std::int64_t* thread_intermediate_storage; mutable_column_device_view* output_column; }; @@ -298,15 +301,15 @@ __device__ void row_output::resolve_output(detail::device_data_reference device_ * @param num_operators Number of operators. * @param row_index Row index of data column(s). */ -__device__ void evaluate_row_expression(detail::row_evaluator const& evaluator, - const detail::device_data_reference* data_references, - const ast_operator* operators, - const cudf::size_type* operator_source_indices, - cudf::size_type num_operators, - cudf::size_type row_index) +__device__ void evaluate_row_expression( + detail::row_evaluator const& evaluator, + device_span data_references, + device_span operators, + device_span operator_source_indices, + cudf::size_type row_index) { - auto operator_source_index = cudf::size_type(0); - for (cudf::size_type operator_index(0); operator_index < num_operators; operator_index++) { + auto operator_source_index = static_cast(0); + for (cudf::size_type operator_index = 0; operator_index < operators.size(); operator_index++) { // Execute operator auto const op = operators[operator_index]; auto const arity = ast_operator_arity(op); @@ -336,41 +339,79 @@ __device__ void evaluate_row_expression(detail::row_evaluator const& evaluator, } } +/** + * @brief The AST plan creates a device buffer of data needed to execute an AST. + * + * On construction, an AST plan creates a single "packed" host buffer of all necessary data arrays, + * and copies that to the device with a single host-device memory copy. Because the plan tends to be + * small, this is the most efficient approach for low latency. + * + */ struct ast_plan { - public: - ast_plan() : sizes(), data_pointers() {} + ast_plan(linearizer const& expr_linearizer, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _sizes{}, _data_pointers{} + { + add_to_plan(expr_linearizer.data_references()); + add_to_plan(expr_linearizer.literals()); + add_to_plan(expr_linearizer.operators()); + add_to_plan(expr_linearizer.operator_source_indices()); + + // Create device buffer + auto const buffer_size = std::accumulate(_sizes.cbegin(), _sizes.cend(), 0); + auto buffer_offsets = std::vector(_sizes.size()); + thrust::exclusive_scan(_sizes.cbegin(), _sizes.cend(), buffer_offsets.begin(), 0); + + auto h_data_buffer = std::make_unique(buffer_size); + for (unsigned int i = 0; i < _data_pointers.size(); ++i) { + std::memcpy(h_data_buffer.get() + buffer_offsets[i], _data_pointers[i], _sizes[i]); + } - using buffer_type = std::pair, int>; + _device_data_buffer = rmm::device_buffer(h_data_buffer.get(), buffer_size, stream, mr); + + stream.synchronize(); + + // Create device pointers to components of plan + auto device_data_buffer_ptr = static_cast(_device_data_buffer.data()); + _device_data_references = device_span( + reinterpret_cast(device_data_buffer_ptr + + buffer_offsets[0]), + expr_linearizer.data_references().size()); + _device_literals = device_span( + reinterpret_cast( + device_data_buffer_ptr + buffer_offsets[1]), + expr_linearizer.literals().size()); + _device_operators = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]), + expr_linearizer.operators().size()); + _device_operator_source_indices = device_span( + reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]), + expr_linearizer.operator_source_indices().size()); + } + /** + * @brief Helper function for adding components (operators, literals, etc) to AST plan + * + * @tparam T The underlying type of the input `std::vector` + * @param v The `std::vector` containing components (operators, literals, etc) + */ template void add_to_plan(std::vector const& v) { auto const data_size = sizeof(T) * v.size(); - sizes.push_back(data_size); - data_pointers.push_back(v.data()); + _sizes.push_back(data_size); + _data_pointers.push_back(v.data()); } - buffer_type get_host_data_buffer() const - { - auto const total_size = std::accumulate(sizes.cbegin(), sizes.cend(), 0); - auto host_data_buffer = std::make_unique(total_size); - auto const offsets = get_offsets(); - for (unsigned int i = 0; i < data_pointers.size(); ++i) { - std::memcpy(host_data_buffer.get() + offsets[i], data_pointers[i], sizes[i]); - } - return std::make_pair(std::move(host_data_buffer), total_size); - } + std::vector _sizes; + std::vector _data_pointers; - std::vector get_offsets() const - { - auto offsets = std::vector(sizes.size()); - thrust::exclusive_scan(sizes.cbegin(), sizes.cend(), offsets.begin(), 0); - return offsets; - } - - private: - std::vector sizes; - std::vector data_pointers; + rmm::device_buffer _device_data_buffer; + device_span _device_data_references; + device_span _device_literals; + device_span _device_operators; + device_span _device_operator_source_indices; }; /** diff --git a/cpp/include/cudf/ast/linearizer.hpp b/cpp/include/cudf/ast/nodes.hpp similarity index 90% rename from cpp/include/cudf/ast/linearizer.hpp rename to cpp/include/cudf/ast/nodes.hpp index e5ccb2e8069..70dda58816e 100644 --- a/cpp/include/cudf/ast/linearizer.hpp +++ b/cpp/include/cudf/ast/nodes.hpp @@ -38,17 +38,10 @@ enum class table_reference { OUTPUT // Column index in the output table }; -// Forward declaration -class literal; -class column_reference; -class expression; - /** * @brief A literal value used in an abstract syntax tree. */ class literal : public detail::node { - friend class detail::linearizer; - public: /** * @brief Construct a new literal object. @@ -90,7 +83,6 @@ class literal : public detail::node { */ cudf::data_type get_data_type() const { return get_value().type(); } - private: /** * @brief Get the value object. * @@ -106,6 +98,7 @@ class literal : public detail::node { */ cudf::size_type accept(detail::linearizer& visitor) const override; + private: const cudf::detail::fixed_width_scalar_device_view_base value; }; @@ -113,8 +106,6 @@ class literal : public detail::node { * @brief A node referring to data from a column in a table. */ class column_reference : public detail::node { - friend class detail::linearizer; - public: /** * @brief Construct a new column reference object @@ -175,7 +166,6 @@ class column_reference : public detail::node { return table.column(get_column_index()).type(); } - private: /** * @brief Accepts a visitor class. * @@ -184,6 +174,7 @@ class column_reference : public detail::node { */ cudf::size_type accept(detail::linearizer& visitor) const override; + private: cudf::size_type column_index; table_reference table_source; }; @@ -192,8 +183,6 @@ class column_reference : public detail::node { * @brief An expression node holds an operator and zero or more operands. */ class expression : public detail::node { - friend class detail::linearizer; - public: /** * @brief Construct a new unary expression object. @@ -208,11 +197,6 @@ class expression : public detail::node { } } - /** - * @brief `expression` doesn't accept r-value references for expression nodes - */ - expression(ast_operator op, node&& input) = delete; - /** * @brief Construct a new binary expression object. * @@ -227,19 +211,11 @@ class expression : public detail::node { } } - /** - * @brief `expression` doesn't accept r-value references for expression nodes - */ - expression(ast_operator op, node&& left, node&& right) = delete; - - /** - * @brief `expression` doesn't accept r-value references for expression nodes - */ + // expression only stores references to nodes, so it does not accept r-value + // references: the calling code must own the nodes. + expression(ast_operator op, node&& input) = delete; + expression(ast_operator op, node&& left, node&& right) = delete; expression(ast_operator op, node&& left, node const& right) = delete; - - /** - * @brief `expression` doesn't accept r-value references for expression nodes - */ expression(ast_operator op, node const& left, node&& right) = delete; /** @@ -256,7 +232,6 @@ class expression : public detail::node { */ std::vector> get_operands() const { return operands; } - private: /** * @brief Accepts a visitor class. * @@ -265,6 +240,7 @@ class expression : public detail::node { */ cudf::size_type accept(detail::linearizer& visitor) const override; + private: const ast_operator op; const std::vector> operands; }; diff --git a/cpp/include/cudf/ast/transform.hpp b/cpp/include/cudf/ast/transform.hpp index 513f92ea251..59697e5f75c 100644 --- a/cpp/include/cudf/ast/transform.hpp +++ b/cpp/include/cudf/ast/transform.hpp @@ -15,7 +15,7 @@ */ #pragma once -#include +#include #include namespace cudf { diff --git a/cpp/src/ast/linearizer.cpp b/cpp/src/ast/linearizer.cpp index cc70845e1ff..66a32ead35e 100644 --- a/cpp/src/ast/linearizer.cpp +++ b/cpp/src/ast/linearizer.cpp @@ -14,7 +14,7 @@ * limitations under the License. */ #include -#include +#include #include #include #include diff --git a/cpp/src/ast/transform.cu b/cpp/src/ast/transform.cu index bc055d46869..43d3bde97c2 100644 --- a/cpp/src/ast/transform.cu +++ b/cpp/src/ast/transform.cu @@ -15,7 +15,7 @@ */ #include -#include +#include #include #include #include @@ -61,27 +61,25 @@ namespace detail { * each thread. */ template -__launch_bounds__(max_block_size) __global__ - void compute_column_kernel(table_device_view const table, - const cudf::detail::fixed_width_scalar_device_view_base* literals, - mutable_column_device_view output_column, - const detail::device_data_reference* data_references, - const ast_operator* operators, - const cudf::size_type* operator_source_indices, - cudf::size_type num_operators, - cudf::size_type num_intermediates) +__launch_bounds__(max_block_size) __global__ void compute_column_kernel( + table_device_view const table, + device_span literals, + mutable_column_device_view output_column, + device_span data_references, + device_span operators, + device_span operator_source_indices, + cudf::size_type num_intermediates) { extern __shared__ std::int64_t intermediate_storage[]; auto thread_intermediate_storage = &intermediate_storage[threadIdx.x * num_intermediates]; - auto const start_idx = cudf::size_type(threadIdx.x + blockIdx.x * blockDim.x); - auto const stride = cudf::size_type(blockDim.x * gridDim.x); - auto const num_rows = table.num_rows(); + auto const start_idx = static_cast(threadIdx.x + blockIdx.x * blockDim.x); + auto const stride = static_cast(blockDim.x * gridDim.x); auto const evaluator = cudf::ast::detail::row_evaluator(table, literals, thread_intermediate_storage, &output_column); - for (cudf::size_type row_index = start_idx; row_index < num_rows; row_index += stride) { + for (cudf::size_type row_index = start_idx; row_index < table.num_rows(); row_index += stride) { evaluate_row_expression( - evaluator, data_references, operators, operator_source_indices, num_operators, row_index); + evaluator, data_references, operators, operator_source_indices, row_index); } } @@ -90,40 +88,8 @@ std::unique_ptr compute_column(table_view const table, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - // Linearize the AST - auto const expr_linearizer = linearizer(expr, table); - auto const data_references = expr_linearizer.data_references(); - auto const literals = expr_linearizer.literals(); - auto const operators = expr_linearizer.operators(); - auto const num_operators = cudf::size_type(operators.size()); - auto const operator_source_indices = expr_linearizer.operator_source_indices(); - auto const expr_data_type = expr_linearizer.root_data_type(); - - // Create ast_plan and device buffer - auto plan = ast_plan(); - plan.add_to_plan(data_references); - plan.add_to_plan(literals); - plan.add_to_plan(operators); - plan.add_to_plan(operator_source_indices); - auto const host_data_buffer = plan.get_host_data_buffer(); - auto const buffer_offsets = plan.get_offsets(); - auto const buffer_size = host_data_buffer.second; - auto device_data_buffer = - rmm::device_buffer(host_data_buffer.first.get(), buffer_size, stream, mr); - // To reduce overhead, we don't call a stream sync here. - // The stream is synced later when the table_device_view is created. - - // Create device pointers to components of plan - auto const device_data_buffer_ptr = static_cast(device_data_buffer.data()); - auto const device_data_references = reinterpret_cast( - device_data_buffer_ptr + buffer_offsets[0]); - auto const device_literals = - reinterpret_cast( - device_data_buffer_ptr + buffer_offsets[1]); - auto const device_operators = - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[2]); - auto const device_operator_source_indices = - reinterpret_cast(device_data_buffer_ptr + buffer_offsets[3]); + auto const expr_linearizer = linearizer(expr, table); // Linearize the AST + auto const plan = ast_plan{expr_linearizer, stream, mr}; // Create ast_plan // Create table device view auto table_device = table_device_view::create(table, stream); @@ -131,7 +97,7 @@ std::unique_ptr compute_column(table_view const table, // Prepare output column auto output_column = cudf::make_fixed_width_column( - expr_data_type, table_num_rows, mask_state::UNALLOCATED, stream, mr); + expr_linearizer.root_data_type(), table_num_rows, mask_state::UNALLOCATED, stream, mr); auto mutable_output_device = cudf::mutable_column_device_view::create(output_column->mutable_view(), stream); @@ -155,12 +121,11 @@ std::unique_ptr compute_column(table_view const table, cudf::ast::detail::compute_column_kernel <<>>( *table_device, - device_literals, + plan._device_literals, *mutable_output_device, - device_data_references, - device_operators, - device_operator_source_indices, - num_operators, + plan._device_data_references, + plan._device_operators, + plan._device_operator_source_indices, num_intermediates); CHECK_CUDA(stream.value()); return output_column; diff --git a/cpp/tests/ast/transform_tests.cpp b/cpp/tests/ast/transform_tests.cpp index 8f4a46e2a54..74937d4deea 100644 --- a/cpp/tests/ast/transform_tests.cpp +++ b/cpp/tests/ast/transform_tests.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -30,6 +31,8 @@ #include #include +#include + #include #include @@ -55,6 +58,22 @@ TEST_F(TransformTest, BasicAddition) cudf::test::expect_columns_equal(expected, result->view(), true); } +TEST_F(TransformTest, BasicAdditionLarge) +{ + auto a = thrust::make_counting_iterator(0); + auto col = column_wrapper(a, a + 2000); + auto table = cudf::table_view{{col, col}}; + + auto col_ref = cudf::ast::column_reference(0); + auto expression = cudf::ast::expression(cudf::ast::ast_operator::ADD, col_ref, col_ref); + + auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); + auto expected = column_wrapper(b, b + 2000); + auto result = cudf::ast::compute_column(table, expression); + + cudf::test::expect_columns_equal(expected, result->view(), true); +} + TEST_F(TransformTest, LessComparator) { auto c_0 = column_wrapper{3, 20, 1, 50}; @@ -71,6 +90,25 @@ TEST_F(TransformTest, LessComparator) cudf::test::expect_columns_equal(expected, result->view(), true); } +TEST_F(TransformTest, LessComparatorLarge) +{ + auto a = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); + auto b = thrust::make_counting_iterator(500); + auto c_0 = column_wrapper(a, a + 2000); + auto c_1 = column_wrapper(b, b + 2000); + auto table = cudf::table_view{{c_0, c_1}}; + + auto col_ref_0 = cudf::ast::column_reference(0); + auto col_ref_1 = cudf::ast::column_reference(1); + auto expression = cudf::ast::expression(cudf::ast::ast_operator::LESS, col_ref_0, col_ref_1); + + auto c = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i < 500; }); + auto expected = column_wrapper(c, c + 2000); + auto result = cudf::ast::compute_column(table, expression); + + cudf::test::expect_columns_equal(expected, result->view(), true); +} + TEST_F(TransformTest, MultiLevelTreeArithmetic) { auto c_0 = column_wrapper{3, 20, 1, 50}; @@ -97,6 +135,34 @@ TEST_F(TransformTest, MultiLevelTreeArithmetic) cudf::test::expect_columns_equal(expected, result->view(), true); } +TEST_F(TransformTest, MultiLevelTreeArithmeticLarge) +{ + using namespace cudf::ast; + + auto a = thrust::make_counting_iterator(0); + auto b = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i + 1; }); + auto c = cudf::detail::make_counting_transform_iterator(0, [](auto i) { return i * 2; }); + auto c_0 = column_wrapper(a, a + 2000); + auto c_1 = column_wrapper(b, b + 2000); + auto c_2 = column_wrapper(c, c + 2000); + auto table = cudf::table_view{{c_0, c_1, c_2}}; + + auto col_ref_0 = column_reference(0); + auto col_ref_1 = column_reference(1); + auto col_ref_2 = column_reference(2); + + auto expr_left_subtree = expression(cudf::ast::ast_operator::MUL, col_ref_0, col_ref_1); + auto expr_right_subtree = expression(cudf::ast::ast_operator::ADD, col_ref_2, col_ref_0); + auto expr_tree = expression(ast_operator::SUB, expr_left_subtree, expr_right_subtree); + + auto result = cudf::ast::compute_column(table, expr_tree); + auto calc = [](auto i) { return (i * (i + 1)) - (i + (i * 2)); }; + auto d = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { return calc(i); }); + auto expected = column_wrapper(d, d + 2000); + + cudf::test::expect_columns_equal(expected, result->view(), true); +} + TEST_F(TransformTest, ImbalancedTreeArithmetic) { auto c_0 = column_wrapper{0.15, 0.37, 4.2, 21.3};