Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement scatter for struct columns #7752

Merged
merged 33 commits into from
Apr 1, 2021
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
Show all changes
33 commits
Select commit Hold shift + click to select a range
1ab2f3b
Add scatter for struct type
ttnghia Mar 29, 2021
174bc3e
Improve performance for scatter: only generate a gather_map if necessary
ttnghia Mar 29, 2021
46f9148
Add a partition test for partitioning a column of struct type
ttnghia Mar 29, 2021
c5112e8
Fix bitmask gathering during scattering data
ttnghia Mar 29, 2021
93d7251
Optimize bitmask gathering for struct type
ttnghia Mar 29, 2021
55152ff
Rewrite `scatter_to_gather`, changing from using `device_vector` to `…
ttnghia Mar 29, 2021
8735879
Re-organize source file list, and add `scatter_struct_tests.cu`
ttnghia Mar 29, 2021
81872d2
Implement test for empty scatter map
ttnghia Mar 29, 2021
ed91095
Rename variables, and add EmptyInputTest test case
ttnghia Mar 29, 2021
fd664b4
Add SimpleScatterTest test case
ttnghia Mar 29, 2021
ab66753
Rewrite test functions
ttnghia Mar 29, 2021
3528b10
Minor changes
ttnghia Mar 29, 2021
b3ee1a7
Rewrite tests for struct scattering
ttnghia Mar 30, 2021
9663c3c
Fix null mask: it should be copied from the target to the result befo…
ttnghia Mar 30, 2021
cade29c
Add some debugging, and fix null mask again
ttnghia Mar 30, 2021
723d0eb
Finish scatter test for complex struct data
ttnghia Mar 30, 2021
079d60b
Add scatter test for struct of lists
ttnghia Mar 30, 2021
35d6430
Remove redundant headers, and rewrite tests
ttnghia Mar 30, 2021
eff5d68
Rewrite all tests
ttnghia Mar 30, 2021
846816f
Minor cleanup for the tests
ttnghia Mar 30, 2021
5aee3ec
Cleanup `scatter.cuh`
ttnghia Mar 30, 2021
c8a39bd
Merge remote-tracking branch 'origin/branch-0.19' into struct_scatter
ttnghia Mar 31, 2021
3d39a6c
Fix build error due to dependency order
ttnghia Mar 31, 2021
635753d
Cleanup header
ttnghia Mar 31, 2021
baab224
Rewrite comments
ttnghia Mar 31, 2021
08ea7fb
Fix types for typed tests
ttnghia Mar 31, 2021
b61a779
Rewrite tests
ttnghia Mar 31, 2021
580aa1f
Rewrite `scatter_to_gather_complement` function
ttnghia Mar 31, 2021
1111ff5
Replace `gather_rows` by `numeric_limits::lowest()`, and replace `thr…
ttnghia Mar 31, 2021
3f66dbd
Replace `gather_rows` by `numeric_limits::lowest()` again, and rewrit…
ttnghia Mar 31, 2021
72facd7
Small optimization for `gather()`: Avoid calling to `gather_bitmask` …
ttnghia Mar 31, 2021
d562bfc
Add comments
ttnghia Mar 31, 2021
871be28
Rewrite comments
ttnghia Mar 31, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
23 changes: 14 additions & 9 deletions cpp/include/cudf/detail/gather.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -567,15 +567,20 @@ struct column_gatherer_impl<struct_view, MapItRoot> {
mr);
});

gather_bitmask(
// Table view of struct column.
cudf::table_view{
std::vector<cudf::column_view>{structs_column.child_begin(), structs_column.child_end()}},
gather_map_begin,
output_struct_members,
nullify_out_of_bounds ? gather_bitmask_op::NULLIFY : gather_bitmask_op::DONT_CHECK,
stream,
mr);
auto const nullable = std::any_of(structs_column.child_begin(),
structs_column.child_end(),
[](auto const& col) { return col.nullable(); });
if (nullable) {
gather_bitmask(
// Table view of struct column.
cudf::table_view{
std::vector<cudf::column_view>{structs_column.child_begin(), structs_column.child_end()}},
gather_map_begin,
output_struct_members,
nullify_out_of_bounds ? gather_bitmask_op::NULLIFY : gather_bitmask_op::DONT_CHECK,
stream,
mr);
}

return cudf::make_structs_column(
gather_map_size,
Expand Down
84 changes: 78 additions & 6 deletions cpp/include/cudf/detail/scatter.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,8 @@
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/uninitialized_fill.h>

namespace cudf {
namespace detail {

Expand Down Expand Up @@ -66,7 +68,9 @@ auto scatter_to_gather(MapIterator scatter_map_begin,
// when calling the gather_bitmask() which applies a pass-through whenever it finds a
// value outside the range of the target column.
// We'll use the gather_rows value for this since it should always be outside the valid range.
auto gather_map = rmm::device_vector<size_type>(gather_rows, gather_rows);
auto gather_map = rmm::device_uvector<size_type>(gather_rows, stream);
thrust::uninitialized_fill(
rmm::exec_policy(stream), gather_map.begin(), gather_map.end(), gather_rows);

// Convert scatter map to a gather map
thrust::scatter(
Expand Down Expand Up @@ -206,6 +210,71 @@ struct column_scatterer_impl<dictionary32, MapIterator> {
}
};

template <typename MapItRoot>
struct column_scatterer_impl<struct_view, MapItRoot> {
std::unique_ptr<column> operator()(column_view const& source,
MapItRoot scatter_map_begin,
MapItRoot scatter_map_end,
column_view const& target,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr) const
{
CUDF_EXPECTS(source.num_children() == target.num_children(),
"Scatter source and target are not of the same type.");

auto const scatter_map_size = std::distance(scatter_map_begin, scatter_map_end);
if (scatter_map_size == 0) { return empty_like(source); }

structs_column_view structs_src(source);
structs_column_view structs_target(target);
std::vector<std::unique_ptr<column>> output_struct_members(structs_src.num_children());

std::transform(structs_src.child_begin(),
structs_src.child_end(),
structs_target.child_begin(),
output_struct_members.begin(),
[&scatter_map_begin, &scatter_map_end, stream, mr](auto const& source_col,
auto const& target_col) {
return type_dispatcher<dispatch_storage_type>(source_col.type(),
column_scatterer<MapItRoot>{},
source_col,
scatter_map_begin,
scatter_map_end,
target_col,
stream,
mr);
});

auto const nullable = std::any_of(structs_src.child_begin(),
structs_src.child_end(),
[](auto const& col) { return col.nullable(); }) or
std::any_of(structs_target.child_begin(),
structs_target.child_end(),
[](auto const& col) { return col.nullable(); });
if (nullable) {
auto const gather_map =
scatter_to_gather(scatter_map_begin, scatter_map_end, source.size(), stream);
gather_bitmask(
// Table view of struct column.
cudf::table_view{
std::vector<cudf::column_view>{structs_src.child_begin(), structs_src.child_end()}},
gather_map.begin(),
output_struct_members,
gather_bitmask_op::NULLIFY,
stream,
mr);
}

return cudf::make_structs_column(
source.size(),
std::move(output_struct_members),
0,
rmm::device_buffer{0, stream, mr}, // Null mask will be fixed up in cudf::scatter().
stream,
mr);
}
};

/**
* @brief Scatters the rows of the source table into a copy of the target table
* according to a scatter map.
Expand Down Expand Up @@ -293,11 +362,14 @@ std::unique_ptr<table> scatter(
mr);
});

auto gather_map = scatter_to_gather(
updated_scatter_map_begin, updated_scatter_map_end, target.num_rows(), stream);

gather_bitmask(source, gather_map.begin(), result, gather_bitmask_op::PASSTHROUGH, stream, mr);

auto const nullable =
std::any_of(source.begin(), source.end(), [](auto const& col) { return col.nullable(); }) or
std::any_of(target.begin(), target.end(), [](auto const& col) { return col.nullable(); });
ttnghia marked this conversation as resolved.
Show resolved Hide resolved
if (nullable) {
auto gather_map = scatter_to_gather(
updated_scatter_map_begin, updated_scatter_map_end, target.num_rows(), stream);
gather_bitmask(source, gather_map.begin(), result, gather_bitmask_op::PASSTHROUGH, stream, mr);
}
return std::make_unique<table>(std::move(result));
}
} // namespace detail
Expand Down
29 changes: 29 additions & 0 deletions cpp/tests/partitioning/partition_test.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -141,6 +141,35 @@ TYPED_TEST(PartitionTest, Identity)
run_partition_test(table_to_partition, map, 6, table_to_partition, expected_offsets);
}

// Partitions a two-row struct column (one nullable child, one null struct row) into 12
// partitions and checks both the reordered rows and the partition offsets.
TYPED_TEST(PartitionTest, Struct)
{
  using value_type = cudf::test::GetType<TypeParam, 0>;
  using map_type   = cudf::test::GetType<TypeParam, 1>;

  // Input: row 0 is null (in the child and in the struct), row 1 is valid.
  fixed_width_column_wrapper<value_type, int32_t> input_child({1, 2}, {0, 1});
  auto input_structs = cudf::test::structs_column_wrapper({input_child}, {0, 1}).release();
  auto input_table   = cudf::table_view{{*input_structs}};

  // Row 0 is assigned to partition 9 and row 1 to partition 2.
  fixed_width_column_wrapper<map_type> partition_map{9, 2};

  // run_partition_test() cannot be used here because it relies on sorting, and struct columns
  // cannot be sorted yet. The expected row order is unambiguous for this input, so the
  // partition call and the comparisons are spelled out directly instead.
  auto partition_count       = 12;
  auto partitioned           = cudf::partition(input_table, partition_map, partition_count);
  auto const& output_table   = partitioned.first;
  auto const& output_offsets = partitioned.second;

  // Expected: the valid row (partition 2) comes first, followed by the null row (partition 9).
  fixed_width_column_wrapper<value_type, int32_t> expected_child({2, 1}, {1, 0});
  auto expected_structs = cudf::test::structs_column_wrapper({expected_child}, {1, 0}).release();
  auto expected_table   = cudf::table_view{{*expected_structs}};

  std::vector<cudf::size_type> offsets_expected{0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2};

  EXPECT_EQ(output_offsets, offsets_expected);

  CUDF_TEST_EXPECT_TABLES_EQUAL(expected_table, *output_table);
}

TYPED_TEST(PartitionTest, Reverse)
{
using value_type = cudf::test::GetType<TypeParam, 0>;
Expand Down