diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index ebdf6a3fb12..037bb70ab77 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -137,6 +137,7 @@ test: - test -f $PREFIX/include/cudf/lists/detail/drop_list_duplicates.hpp - test -f $PREFIX/include/cudf/lists/detail/interleave_columns.hpp - test -f $PREFIX/include/cudf/lists/detail/sorting.hpp + - test -f $PREFIX/include/cudf/lists/concatenate_rows.hpp - test -f $PREFIX/include/cudf/lists/count_elements.hpp - test -f $PREFIX/include/cudf/lists/explode.hpp - test -f $PREFIX/include/cudf/lists/drop_list_duplicates.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 1689de29b05..198690e37ff 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -263,6 +263,7 @@ add_library(cudf src/join/join.cu src/join/semi_join.cu src/lists/contains.cu + src/lists/concatenate_rows.cu src/lists/copying/concatenate.cu src/lists/copying/copying.cu src/lists/copying/gather.cu diff --git a/cpp/include/cudf/lists/concatenate_rows.hpp b/cpp/include/cudf/lists/concatenate_rows.hpp new file mode 100644 index 00000000000..1d93de418f8 --- /dev/null +++ b/cpp/include/cudf/lists/concatenate_rows.hpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#pragma once + +#include +#include + +namespace cudf { +namespace lists { +/** + * @addtogroup lists_concatenate_rows + * @{ + * @file + */ + +/** + * @brief Flag to specify whether a null list element will be ignored from concatenation, or the + * entire concatenation result involving null list elements will be a null element. + */ +enum class concatenate_null_policy { IGNORE, NULLIFY_OUTPUT_ROW }; + +/** + * @brief Row-wise concatenate multiple lists columns into a single lists column. + * + * The output column is generated by concatenating the elements within each row of the input + * table. If any row of the input table contains null list elements, the concatenation process will + * either ignore those null elements, or will simply set the entire resulting row to be a null + * element. + * + * @code{.pseudo} + * s1 = [{0, 1}, {2, 3, 4}, {5}, {}, {6, 7}] + * s2 = [{8}, {9}, {}, {10, 11, 12}, {13, 14, 15, 16}] + * r = lists::concatenate_rows(s1, s2) + * r is now [{0, 1, 8}, {2, 3, 4, 9}, {5}, {10, 11, 12}, {6, 7, 13, 14, 15, 16}] + * @endcode + * + * @throws cudf::logic_error if the input table has no columns. + * @throws cudf::logic_error if any column of the input table is not a lists column. + * @throws cudf::logic_error if any lists column contains entries of a nested type. + * @throws cudf::logic_error if the lists columns do not all have the same entry type. + * + * @param input Table of lists to be concatenated. + * @param null_policy The parameter to specify whether a null list element will be ignored from + * concatenation, or any concatenation involving a null list element will result in a null list. + * @param mr Device memory resource used to allocate the returned column's device memory. + * @return A new column in which each row is a list resulting from concatenating all list elements in + * the corresponding row of the input table. 
+ */ +std::unique_ptr concatenate_rows( + table_view const& input, + concatenate_null_policy null_policy = concatenate_null_policy::IGNORE, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of group +} // namespace lists +} // namespace cudf diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index f78ff98d49d..11b907e7f16 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -143,6 +143,7 @@ * @} * @defgroup lists_apis Lists * @{ + * @defgroup lists_concatenate_rows Combining * @defgroup lists_extract Extracting * @defgroup lists_contains Searching * @defgroup lists_gather Gathering diff --git a/cpp/src/lists/concatenate_rows.cu b/cpp/src/lists/concatenate_rows.cu new file mode 100644 index 00000000000..51df7255df9 --- /dev/null +++ b/cpp/src/lists/concatenate_rows.cu @@ -0,0 +1,438 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +#include +#include +#include +#include + +namespace cudf { +namespace lists { +namespace detail { +namespace { +std::unique_ptr concatenate_rows_ignore_null(table_view const& input, + bool has_null_mask, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_output_lists = input.num_rows(); + auto const table_dv_ptr = table_device_view::create(input); + + // Interleave the list element from the input table, thus all the lists at the same row now stay + // next to each other. + auto interleaved_columns = detail::interleave_columns(input, has_null_mask, stream); + + // Modify the list offsets to combine lists of the same input row. + static_assert(sizeof(offset_type) == sizeof(int32_t)); + static_assert(sizeof(size_type) == sizeof(int32_t)); + auto list_offsets = make_numeric_column( + data_type{type_id::INT32}, num_output_lists + 1, mask_state::UNALLOCATED, stream, mr); + auto const d_offsets = list_offsets->mutable_view().template begin(); + + // The array of int8_t to store validities for list elements. + // Since we combine multiple lists, we need to recompute list validities. + auto validities = rmm::device_uvector(has_null_mask ? num_output_lists : 0, stream); + + // For an input table of `n` columns, if after interleaving we have the list offsets are + // [ i_0, i_1, ..., i_n, i_n+1, ..., i_2n, ... ] then to concatenate them just modify the offsets + // to be [ i_0, i_n, i_2n, i_3n, ... ]. 
+ auto const d_interleaved_offsets = lists_column_view(interleaved_columns->view()).offsets_begin(); + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_output_lists + 1), + d_offsets, + [d_interleaved_offsets, + num_cols = input.num_columns(), + table_dv = *table_dv_ptr, + d_validities = validities.begin(), + has_null_mask] __device__(auto const idx) { + if (has_null_mask) { + auto const any_valid = thrust::any_of( + thrust::seq, table_dv.begin(), table_dv.end(), [idx](auto const& list_col) { + return list_col.is_valid(idx); + }); + d_validities[idx] = static_cast(any_valid); + } + return d_interleaved_offsets[idx * num_cols]; + }); + + auto [null_mask, null_count] = [&] { + return has_null_mask + ? cudf::detail::valid_if( + validities.begin(), validities.end(), thrust::identity{}, stream, mr) + : std::make_pair(rmm::device_buffer{}, size_type{0}); + }(); + + // The child column containing list entries is taken from the `interleaved_columns` column. + auto interleaved_columns_content = interleaved_columns->release(); + + return make_lists_column( + num_output_lists, + std::move(list_offsets), + std::move(interleaved_columns_content.children[lists_column_view::child_column_index]), + null_count, + null_count > 0 ? std::move(null_mask) : rmm::device_buffer{}, + stream, + mr); +} + +/** + * @brief Generate list offsets and list validities for the output lists column from the table_view + * of the input lists columns. + * + * This function is called only when (has_null_mask == true and null_policy == NULLIFY_OUTPUT_ROW). + */ +std::pair, rmm::device_uvector> +generate_list_offsets_and_validities(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const num_output_lists = input.num_rows(); + auto const table_dv_ptr = table_device_view::create(input); + + // The output offsets column. 
+ static_assert(sizeof(offset_type) == sizeof(int32_t)); + static_assert(sizeof(size_type) == sizeof(int32_t)); + auto list_offsets = make_numeric_column( + data_type{type_id::INT32}, num_output_lists + 1, mask_state::UNALLOCATED, stream, mr); + auto const d_offsets = list_offsets->mutable_view().template begin(); + + // The array of int8_t to store validities for list elements. + auto validities = rmm::device_uvector(num_output_lists, stream); + + // Compute list sizes and validities. + thrust::transform( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_output_lists), + d_offsets, + [table_dv = *table_dv_ptr, d_validities = validities.begin()] __device__(size_type const idx) { + auto const all_valid = + thrust::all_of(thrust::seq, table_dv.begin(), table_dv.end(), [idx](auto const& list_col) { + return list_col.is_valid(idx); + }); + d_validities[idx] = static_cast(all_valid); + if (not all_valid) return size_type{0}; + + // Compute size of the output list as sum of sizes of input lists + return thrust::transform_reduce( + thrust::seq, + table_dv.begin(), + table_dv.end(), + [idx] __device__(auto const& lists_col) { + auto const list_offsets = + lists_col.child(lists_column_view::offsets_column_index).template data() + + lists_col.offset(); + return list_offsets[idx + 1] - list_offsets[idx]; // list size + }, + size_type{0}, + thrust::plus{}); + }); + + // Compute offsets from sizes. + thrust::exclusive_scan( + rmm::exec_policy(stream), d_offsets, d_offsets + num_output_lists + 1, d_offsets); + + return {std::move(list_offsets), std::move(validities)}; +} + +/** + * @brief Compute string sizes, string validities, and concatenate string lists functor. + * + * This functor is called only when (has_null_mask == true and null_policy == NULLIFY_OUTPUT_ROW). + * It is executed twice. In the first pass, the sizes and validities of the output strings will be + * computed. 
In the second pass, this will concatenate the lists of strings of the given table of + * lists columns in a row-wise manner. + */ +struct compute_string_sizes_and_concatenate_lists_fn { + table_device_view const table_dv; + + // Store list offsets of the output lists column. + offset_type const* const dst_list_offsets; + + // Store offsets of the strings. + offset_type* d_offsets{nullptr}; + + // If d_chars == nullptr: only compute sizes and validities of the output strings. + // If d_chars != nullptr: only concatenate strings. + char* d_chars{nullptr}; + + // We need to set `1` or `0` for the validities of the strings in the child column. + int8_t* d_validities{nullptr}; + + __device__ void operator()(size_type const idx) + { + // The current row contain null, which has been identified during `dst_list_offsets` + // computation. + if (dst_list_offsets[idx + 1] == dst_list_offsets[idx]) { return; } + + // read_idx and write_idx are indices of string elements. + size_type write_idx = dst_list_offsets[idx]; + thrust::for_each( + thrust::seq, table_dv.begin(), table_dv.end(), [&] __device__(auto const& lists_col) { + auto const list_offsets = + lists_col.child(lists_column_view::offsets_column_index).template data() + + lists_col.offset(); + auto const& str_col = lists_col.child(lists_column_view::child_column_index); + auto const str_offsets = + str_col.child(strings_column_view::offsets_column_index).template data(); + + // The indices of the strings within the source list. 
+ auto const start_str_idx = list_offsets[idx]; + auto const end_str_idx = list_offsets[idx + 1]; + + if (not d_chars) { // just compute sizes of strings within a list + for (auto read_idx = start_str_idx; read_idx < end_str_idx; ++read_idx, ++write_idx) { + d_validities[write_idx] = static_cast(str_col.is_valid(read_idx)); + d_offsets[write_idx] = str_offsets[read_idx + 1] - str_offsets[read_idx]; + } + } else { // just copy the entire memory region containing all strings in the list + // start_byte and end_byte are indices of character of the string elements. + auto const start_byte = str_offsets[start_str_idx]; + auto const end_byte = str_offsets[end_str_idx]; + if (start_byte < end_byte) { + auto const input_ptr = + str_col.child(strings_column_view::chars_column_index).template data() + + start_byte; + auto const output_ptr = d_chars + d_offsets[write_idx]; + thrust::copy(thrust::seq, input_ptr, input_ptr + end_byte - start_byte, output_ptr); + write_idx += end_str_idx - start_str_idx; + } + } + }); + } +}; + +/** + * @brief Struct used in type_dispatcher to interleave list entries of the input lists columns and + * output the results into a destination column. + * + * This functor is called only when (has_null_mask == true and null_policy == NULLIFY_OUTPUT_ROW). + */ +struct concatenate_lists_fn { + template + std::enable_if_t, std::unique_ptr> operator()( + table_view const& input, + column_view const& output_list_offsets, + size_type num_output_lists, + size_type num_output_entries, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const noexcept + { + auto const table_dv_ptr = table_device_view::create(input); + auto const comp_fn = compute_string_sizes_and_concatenate_lists_fn{ + *table_dv_ptr, output_list_offsets.template begin()}; + + // Generate a null mask because the input table has nullable column. 
+ auto [offsets_column, chars_column, null_mask, null_count] = + cudf::strings::detail::make_strings_children_with_null_mask( + comp_fn, num_output_lists, num_output_entries, stream, mr); + + return make_strings_column(num_output_entries, + std::move(offsets_column), + std::move(chars_column), + null_count, + std::move(null_mask), + stream, + mr); + } + + template + std::enable_if_t(), std::unique_ptr> operator()( + table_view const& input, + column_view const& output_list_offsets, + size_type num_output_lists, + size_type num_output_entries, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) const noexcept + { + auto const table_dv_ptr = table_device_view::create(input); + + // The output child column. + auto const child_col = lists_column_view(*input.begin()).child(); + auto output = + allocate_like(child_col, num_output_entries, mask_allocation_policy::NEVER, stream, mr); + auto output_dv_ptr = mutable_column_device_view::create(*output); + + // The array of int8_t to store entry validities. + auto validities = rmm::device_uvector(num_output_entries, stream); + + thrust::for_each_n( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + num_output_lists, + [num_cols = input.num_columns(), + table_dv = *table_dv_ptr, + d_validities = validities.begin(), + dst_list_offsets = output_list_offsets.template begin(), + d_output = output_dv_ptr->template begin()] __device__(size_type const idx) { + // The output row has been identified as a null list during list size computation. 
+ if (dst_list_offsets[idx + 1] == dst_list_offsets[idx]) { return; } + + auto write_start = dst_list_offsets[idx]; + thrust::for_each( + thrust::seq, table_dv.begin(), table_dv.end(), [&] __device__(auto const& lists_col) { + auto const list_offsets = lists_col.child(lists_column_view::offsets_column_index) + .template data() + + lists_col.offset(); + auto const& data_col = lists_col.child(lists_column_view::child_column_index); + + // The indices of the entries within the source list. + auto const start_idx = list_offsets[idx]; + auto const end_idx = list_offsets[idx + 1]; + + // Fill the validities array. + for (auto read_idx = start_idx, write_idx = write_start; read_idx < end_idx; + ++read_idx, ++write_idx) { + d_validities[write_idx] = static_cast(data_col.is_valid(read_idx)); + } + // Do a copy for the entire list entries. + auto const input_ptr = + reinterpret_cast(data_col.template data() + start_idx); + auto const output_ptr = reinterpret_cast(&d_output[write_start]); + thrust::copy( + thrust::seq, input_ptr, input_ptr + sizeof(T) * (end_idx - start_idx), output_ptr); + write_start += end_idx - start_idx; + }); + }); + + auto [null_mask, null_count] = cudf::detail::valid_if( + validities.begin(), validities.end(), thrust::identity{}, stream, mr); + if (null_count > 0) { output->set_null_mask(null_mask, null_count); } + + return output; + } + + template + std::enable_if_t and not cudf::is_fixed_width(), + std::unique_ptr> + operator()(table_view const&, + column_view const&, + size_type, + size_type, + rmm::cuda_stream_view, + rmm::mr::device_memory_resource*) const + { + // Currently, only support string_view and fixed-width types + CUDF_FAIL("Called `concatenate_lists_fn()` on non-supported types."); + } +}; + +std::unique_ptr concatenate_with_nullifying_rows(table_view const& input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + // Generate offsets of the output lists column. 
+ auto [list_offsets, list_validities] = generate_list_offsets_and_validities(input, stream, mr); + auto const offsets_view = list_offsets->view(); + + // Copy entries from the input lists columns to the output lists column - this needed to be + // specialized for different types. + auto const num_output_lists = input.num_rows(); + auto const num_output_entries = + cudf::detail::get_value(offsets_view, num_output_lists, stream); + auto list_entries = + type_dispatcher(lists_column_view(*input.begin()).child().type(), + concatenate_lists_fn{}, + input, + offsets_view, + num_output_lists, + num_output_entries, + stream, + mr); + + auto [null_mask, null_count] = cudf::detail::valid_if( + list_validities.begin(), list_validities.end(), thrust::identity{}, stream, mr); + return make_lists_column(num_output_lists, + std::move(list_offsets), + std::move(list_entries), + null_count, + null_count ? std::move(null_mask) : rmm::device_buffer{}, + stream, + mr); +} + +} // namespace + +/** + * @copydoc cudf::lists::concatenate_rows + * + * @param stream CUDA stream used for device memory operations and kernel launches. 
+ */ +std::unique_ptr concatenate_rows(table_view const& input, + concatenate_null_policy null_policy, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_EXPECTS(input.num_columns() > 0, "The input table must have at least one column."); + + auto const entry_type = lists_column_view(*input.begin()).child().type(); + for (auto const& col : input) { + CUDF_EXPECTS(col.type().id() == type_id::LIST, + "All columns of the input table must be of lists column type."); + + auto const child_col = lists_column_view(col).child(); + CUDF_EXPECTS(not cudf::is_nested(child_col.type()), "Nested types are not supported."); + CUDF_EXPECTS(entry_type == child_col.type(), + "The types of entries in the input columns must be the same."); + } + + if (input.num_rows() == 0) { return cudf::empty_like(input.column(0)); } + if (input.num_columns() == 1) { return std::make_unique(*(input.begin()), stream, mr); } + + // List concatenation can be implemented by simply interleaving the lists columns, then modify the + // list offsets. + auto const has_null_mask = std::any_of( + std::cbegin(input), std::cend(input), [](auto const& col) { return col.nullable(); }); + if (not has_null_mask or null_policy == concatenate_null_policy::IGNORE) { + return concatenate_rows_ignore_null(input, has_null_mask, stream, mr); + } + + // Both conditions satisfied: has_null_mask == true and + // null_policy == NULLIFY_OUTPUT_ROW. 
+ return concatenate_with_nullifying_rows(input, stream, mr); +} + +} // namespace detail + +/** + * @copydoc cudf::lists::concatenate_rows + */ +std::unique_ptr concatenate_rows(table_view const& lists_columns, + concatenate_null_policy null_policy, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::concatenate_rows(lists_columns, null_policy, rmm::cuda_stream_default, mr); +} + +} // namespace lists +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 7a3e14b4f12..6dd50592274 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -397,10 +397,11 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp) ################################################################################################### # - lists tests ---------------------------------------------------------------------------------- ConfigureTest(LISTS_TEST + lists/concatenate_rows_tests.cpp lists/contains_tests.cpp lists/count_elements_tests.cpp - lists/explode_tests.cpp lists/drop_list_duplicates_tests.cpp + lists/explode_tests.cpp lists/extract_tests.cpp lists/sort_lists_tests.cpp) diff --git a/cpp/tests/lists/concatenate_rows_tests.cpp b/cpp/tests/lists/concatenate_rows_tests.cpp new file mode 100644 index 00000000000..9c4329677e1 --- /dev/null +++ b/cpp/tests/lists/concatenate_rows_tests.cpp @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include + +#include + +namespace { +using StrListsCol = cudf::test::lists_column_wrapper; +using IntListsCol = cudf::test::lists_column_wrapper; +using IntCol = cudf::test::fixed_width_column_wrapper; +using TView = cudf::table_view; + +constexpr bool print_all{false}; // For debugging +// Placeholder value for the entries that the accompanying validity iterator marks as null. +constexpr int32_t null{0}; + +// Shorthand wrappers around the cudf::test validity iterators used to mark null entries/rows. +auto all_nulls() { return cudf::test::iterator_all_nulls(); } + +auto null_at(cudf::size_type idx) { return cudf::test::iterator_with_null_at(idx); } + +auto null_at(std::vector const& indices) +{ + return cudf::test::iterator_with_null_at(cudf::host_span{indices}); +} + +} // namespace + +struct ListConcatenateRowsTest : public cudf::test::BaseFixture { +}; + +// Each invalid input below must make concatenate_rows throw cudf::logic_error. +TEST_F(ListConcatenateRowsTest, InvalidInput) +{ + // Empty input table + EXPECT_THROW(cudf::lists::concatenate_rows(TView{}), cudf::logic_error); + + // Input table contains non-list column + { + auto const col1 = IntCol{}.release(); + auto const col2 = IntListsCol{}.release(); + EXPECT_THROW(cudf::lists::concatenate_rows(TView{{col1->view(), col2->view()}}), + cudf::logic_error); + } + + // Types mismatch + { + auto const col1 = IntListsCol{}.release(); + auto const col2 = StrListsCol{}.release(); + EXPECT_THROW(cudf::lists::concatenate_rows(TView{{col1->view(), col2->view()}}), + cudf::logic_error); + } + + // Nested types are not supported + { + auto const col = IntListsCol{{IntListsCol{1, 2, 3}, IntListsCol{4, 5, 6}}}.release(); + EXPECT_THROW(cudf::lists::concatenate_rows(TView{{col->view(), col->view()}}), + cudf::logic_error); + } +} + +template +struct ListConcatenateRowsTypedTest : public cudf::test::BaseFixture { +}; + +using TypesForTest = cudf::test::Concat; +TYPED_TEST_CASE(ListConcatenateRowsTypedTest, TypesForTest); + +// Concatenating empty lists columns is expected to produce a column equivalent to the input. +TYPED_TEST(ListConcatenateRowsTypedTest, ConcatenateEmptyColumns) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto const col = ListsCol{}.release(); + auto const results = 
+ cudf::lists::concatenate_rows(TView{{col->view(), col->view()}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*col, *results, print_all); +} + +// A table with a single column is expected to produce a column equivalent to that column. +TYPED_TEST(ListConcatenateRowsTypedTest, ConcatenateOneColumnNotNull) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto const col = ListsCol{{1, 2}, {3, 4}, {5, 6}}.release(); + auto const results = cudf::lists::concatenate_rows(TView{{col->view()}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*col, *results, print_all); +} + +TYPED_TEST(ListConcatenateRowsTypedTest, ConcatenateOneColumnWithNulls) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto const col = ListsCol{{ListsCol{{1, 2, null}, null_at(2)}, + ListsCol{} /*NULL*/, + ListsCol{{null, 3, 4, 4, 4, 4}, null_at(0)}, + ListsCol{5, 6}}, + null_at(1)} + .release(); + auto const results = cudf::lists::concatenate_rows(TView{{col->view()}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*col, *results, print_all); +} + +TYPED_TEST(ListConcatenateRowsTypedTest, SimpleInputNoNull) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto const col1 = ListsCol{{1, 2}, {3, 4}, {5, 6}}.release(); + auto const col2 = ListsCol{{7, 8}, {9, 10}, {11, 12}}.release(); + auto const expected = ListsCol{{1, 2, 7, 8}, {3, 4, 9, 10}, {5, 6, 11, 12}}.release(); + auto const results = cudf::lists::concatenate_rows(TView{{col1->view(), col2->view()}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, print_all); +} + +TEST_F(ListConcatenateRowsTest, SimpleInputStringsColumnsNoNull) +{ + auto const col1 = StrListsCol{ + StrListsCol{"Tomato", "Apple"}, + StrListsCol{"Banana", "Kiwi", "Cherry"}, + StrListsCol{ + "Coconut"}}.release(); + auto const col2 = + StrListsCol{StrListsCol{"Orange"}, StrListsCol{"Lemon", "Peach"}, StrListsCol{}}.release(); + auto const expected = StrListsCol{ + StrListsCol{"Tomato", "Apple", "Orange"}, + StrListsCol{"Banana", "Kiwi", "Cherry", "Lemon", "Peach"}, + StrListsCol{ + "Coconut"}}.release(); + auto const results = 
+ cudf::lists::concatenate_rows(TView{{col1->view(), col2->view()}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, print_all); +} + +TYPED_TEST(ListConcatenateRowsTypedTest, SimpleInputWithNulls) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto const col1 = ListsCol{{ListsCol{{1, null, 3, 4}, null_at(1)}, + ListsCol{{null, 2, 3, 4}, null_at(0)}, + ListsCol{{null, 2, 3, 4}, null_at(0)}, + ListsCol{} /*NULL*/, + ListsCol{{1, 2, null, 4}, null_at(2)}, + ListsCol{{1, 2, 3, null}, null_at(3)}}, + null_at(3)} + .release(); + auto const col2 = ListsCol{{ListsCol{{10, 11, 12, null}, null_at(3)}, + ListsCol{{13, 14, 15, 16, 17, null}, null_at(5)}, + ListsCol{} /*NULL*/, + ListsCol{{null, 18}, null_at(0)}, + ListsCol{{19, 20, null}, null_at(2)}, + ListsCol{{null}, null_at(0)}}, + null_at(2)} + .release(); + auto const col3 = ListsCol{{ListsCol{} /*NULL*/, + ListsCol{{20, null}, null_at(1)}, + ListsCol{{null, 21, null, null}, null_at({0, 2, 3})}, + ListsCol{}, + ListsCol{22, 23, 24, 25}, + ListsCol{{null, null, null, null, null}, all_nulls()}}, + null_at(0)} + .release(); + + // No null policy is given: null list elements are ignored + { + auto const results = + cudf::lists::concatenate_rows(TView{{col1->view(), col2->view(), col3->view()}}); + auto const expected = + ListsCol{ListsCol{{1, null, 3, 4, 10, 11, 12, null}, null_at({1, 7})}, + ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, + ListsCol{{null, 2, 3, 4, null, 21, null, null}, null_at({0, 4, 6, 7})}, + ListsCol{{null, 18}, null_at(0)}, + ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, + ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, + null_at({3, 4, 5, 6, 7, 8, 9})}} + .release(); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, print_all); + } + + // NULLIFY_OUTPUT_ROW: a null list in any input column makes the whole output row null + { + auto const results = + cudf::lists::concatenate_rows(TView{{col1->view(), col2->view(), col3->view()}}, + 
+ cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = + ListsCol{{ListsCol{} /*NULL*/, + ListsCol{{null, 2, 3, 4, 13, 14, 15, 16, 17, null, 20, null}, null_at({0, 9, 11})}, + ListsCol{} /*NULL*/, + ListsCol{} /*NULL*/, + ListsCol{{1, 2, null, 4, 19, 20, null, 22, 23, 24, 25}, null_at({2, 6})}, + ListsCol{{1, 2, 3, null, null, null, null, null, null, null}, + null_at({3, 4, 5, 6, 7, 8, 9})}}, + null_at({0, 2, 3})} + .release(); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, print_all); + } +} + +TEST_F(ListConcatenateRowsTest, SimpleInputStringsColumnsWithNulls) +{ + auto const col1 = StrListsCol{ + StrListsCol{{"Tomato", "Bear" /*NULL*/, "Apple"}, null_at(1)}, + StrListsCol{{"Banana", "Pig" /*NULL*/, "Kiwi", "Cherry", "Whale" /*NULL*/}, null_at({1, 4})}, + StrListsCol{ + "Coconut"}}.release(); + auto const col2 = + StrListsCol{ + {StrListsCol{{"Orange", "Dog" /*NULL*/, "Fox" /*NULL*/, "Duck" /*NULL*/}, null_at({1, 2, 3})}, + StrListsCol{"Lemon", "Peach"}, + StrListsCol{{"Deer" /*NULL*/, "Snake" /*NULL*/, "Horse" /*NULL*/}, all_nulls()}}, /*NULL*/ + null_at(2)} + .release(); + + // No null policy is given: null list elements are ignored + { + auto const results = cudf::lists::concatenate_rows(TView{{col1->view(), col2->view()}}); + auto const expected = StrListsCol{ + StrListsCol{{"Tomato", "" /*NULL*/, "Apple", "Orange", "" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, + null_at({1, 4, 5, 6})}, + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/, "Lemon", "Peach"}, + null_at({1, 4})}, + StrListsCol{ + "Coconut"}}.release(); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, print_all); + } + + // NULLIFY_OUTPUT_ROW: a null list in any input column makes the whole output row null + { + auto const results = + cudf::lists::concatenate_rows(TView{{col1->view(), col2->view()}}, + cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = + StrListsCol{ + {StrListsCol{ + {"Tomato", "" /*NULL*/, "Apple", "Orange", "" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, + 
+ null_at({1, 4, 5, 6})}, + StrListsCol{{"Banana", "" /*NULL*/, "Kiwi", "Cherry", "" /*NULL*/, "Lemon", "Peach"}, + null_at({1, 4})}, + StrListsCol{""} /*NULL*/}, + null_at(2)} + .release(); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, print_all); + } +} + +// The input columns are overlapping slices of one parent column. +TYPED_TEST(ListConcatenateRowsTypedTest, SlicedColumnsInputNoNull) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto const col_original = ListsCol{{1, 2, 3}, {2, 3}, {3, 4, 5, 6}, {5, 6}, {}, {7}}.release(); + auto const col1 = cudf::slice(col_original->view(), {0, 3})[0]; + auto const col2 = cudf::slice(col_original->view(), {1, 4})[0]; + auto const col3 = cudf::slice(col_original->view(), {2, 5})[0]; + auto const col4 = cudf::slice(col_original->view(), {3, 6})[0]; + auto const expected = ListsCol{ + {1, 2, 3, 2, 3, 3, 4, 5, 6, 5, 6}, + {2, 3, 3, 4, 5, 6, 5, 6}, + {3, 4, 5, 6, 5, 6, 7}}.release(); + auto const results = cudf::lists::concatenate_rows(TView{{col1, col2, col3, col4}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, print_all); +} + +TYPED_TEST(ListConcatenateRowsTypedTest, SlicedColumnsInputWithNulls) +{ + using ListsCol = cudf::test::lists_column_wrapper; + + auto const col_original = ListsCol{{ListsCol{{null, 2, 3}, null_at(0)}, + ListsCol{2, 3}, /*NULL*/ + ListsCol{{3, null, 5, 6}, null_at(1)}, + ListsCol{5, 6}, /*NULL*/ + ListsCol{}, /*NULL*/ + ListsCol{7}, + ListsCol{8, 9, 10}}, + null_at({1, 3, 4})} + .release(); + auto const col1 = cudf::slice(col_original->view(), {0, 3})[0]; + auto const col2 = cudf::slice(col_original->view(), {1, 4})[0]; + auto const col3 = cudf::slice(col_original->view(), {2, 5})[0]; + auto const col4 = cudf::slice(col_original->view(), {3, 6})[0]; + auto const col5 = cudf::slice(col_original->view(), {4, 7})[0]; + auto const expected = ListsCol{ + ListsCol{{null, 2, 3, 3, null, 5, 6}, null_at({0, 4})}, + ListsCol{{3, null, 5, 6, 7}, null_at(1)}, + ListsCol{{3, null, 5, 6, 7, 8, 9, 10}, + null_at(1)}}.release(); + auto const 
+ results = cudf::lists::concatenate_rows(TView{{col1, col2, col3, col4, col5}}); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, print_all); +} + +TEST_F(ListConcatenateRowsTest, SlicedStringsColumnsInputWithNulls) +{ + auto const col = + StrListsCol{ + {StrListsCol{{"Tomato", "Bear" /*NULL*/, "Apple"}, null_at(1)}, + StrListsCol{{"Banana", "Pig" /*NULL*/, "Kiwi", "Cherry", "Whale" /*NULL*/}, null_at({1, 4})}, + StrListsCol{"Coconut"}, + StrListsCol{{"Orange", "Dog" /*NULL*/, "Fox" /*NULL*/, "Duck" /*NULL*/}, null_at({1, 2, 3})}, + StrListsCol{"Lemon", "Peach"}, + StrListsCol{{"Deer" /*NULL*/, "Snake" /*NULL*/, "Horse" /*NULL*/}, all_nulls()}}, /*NULL*/ + null_at(5)} + .release(); + auto const col1 = cudf::slice(col->view(), {0, 3})[0]; + auto const col2 = cudf::slice(col->view(), {1, 4})[0]; + auto const col3 = cudf::slice(col->view(), {2, 5})[0]; + auto const col4 = cudf::slice(col->view(), {3, 6})[0]; + + { + auto const results = cudf::lists::concatenate_rows(TView{{col1, col2, col3, col4}}); + auto const expected = StrListsCol{ + StrListsCol{{"Tomato", + "" /*NULL*/, + "Apple", + "Banana", + "" /*NULL*/, + "Kiwi", + "Cherry", + "" /*NULL*/, + "Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/}, + null_at({1, 4, 7, 10, 11, 12})}, + StrListsCol{{"Banana", + "" /*NULL*/, + "Kiwi", + "Cherry", + "" /*NULL*/, + "Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "", /*NULL*/ + "Lemon", + "Peach"}, + null_at({1, 4, 7, 8, 9})}, + StrListsCol{ + { + "Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "", /*NULL*/ + "Lemon", + "Peach", + }, + null_at({2, 3, 4})}}.release(); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, print_all); + } + + { + auto const results = cudf::lists::concatenate_rows( + TView{{col1, col2, col3, col4}}, cudf::lists::concatenate_null_policy::NULLIFY_OUTPUT_ROW); + auto const expected = StrListsCol{{StrListsCol{{"Tomato", + "" /*NULL*/, + "Apple", + "Banana", + "" /*NULL*/, + "Kiwi", + "Cherry", + "" 
+ /*NULL*/, + "Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "" /*NULL*/}, + null_at({1, 4, 7, 10, 11, 12})}, + StrListsCol{{"Banana", + "" /*NULL*/, + "Kiwi", + "Cherry", + "" /*NULL*/, + "Coconut", + "Orange", + "" /*NULL*/, + "" /*NULL*/, + "", /*NULL*/ + "Lemon", + "Peach"}, + null_at({1, 4, 7, 8, 9})}, + StrListsCol{} /*NULL*/}, + null_at(2)} + .release(); + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(*expected, *results, print_all); + } +}