diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml
index 51b3e8afc05..c3e84447b3a 100644
--- a/conda/recipes/libcudf/meta.yaml
+++ b/conda/recipes/libcudf/meta.yaml
@@ -122,6 +122,7 @@ test:
     - test -f $PREFIX/include/cudf/join.hpp
     - test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp
     - test -f $PREFIX/include/cudf/lists/detail/copying.hpp
+    - test -f $PREFIX/include/cudf/lists/count_elements.hpp
     - test -f $PREFIX/include/cudf/lists/extract.hpp
     - test -f $PREFIX/include/cudf/lists/gather.hpp
     - test -f $PREFIX/include/cudf/lists/lists_column_view.hpp
diff --git a/cpp/include/cudf/lists/count_elements.hpp b/cpp/include/cudf/lists/count_elements.hpp
new file mode 100644
index 00000000000..6b802d2ad5e
--- /dev/null
+++ b/cpp/include/cudf/lists/count_elements.hpp
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+
+namespace cudf {
+namespace lists {
+/**
+ * @addtogroup lists_elements
+ * @{
+ * @file
+ */
+
+/**
+ * @brief Returns a numeric column containing the number of elements in
+ * each list row of the given lists column.
+ *
+ * The output column will have the same number of rows as the
+ * input lists column. Each `output[i]` will be `input[i].size()`.
+ *
+ * @code{.pseudo}
+ * l = { {1, 2, 3}, {4}, {5, 6} }
+ * r = count_elements(l)
+ * r is now {3, 1, 2}
+ * @endcode
+ *
+ * Any null input element will result in a corresponding null entry
+ * in the output column. For nested lists only the outermost level is counted.
+ *
+ * @param input Input lists column.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return New INT32 column with the number of elements for each row.
+ */
+std::unique_ptr count_elements(
+  lists_column_view const& input,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/** @} */  // end of lists_elements group
+
+}  // namespace lists
+}  // namespace cudf
diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h
index 03e00b881d8..1d796aca4b7 100644
--- a/cpp/include/doxygen_groups.h
+++ b/cpp/include/doxygen_groups.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2021, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -143,6 +143,7 @@
  * @defgroup lists_apis Lists
  * @{
  *   @defgroup lists_extract Extracting
+ *   @defgroup lists_elements Counting
  * @}
  * @defgroup nvtext_apis NVText
  * @{
diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu
new file mode 100644
index 00000000000..78549152770
--- /dev/null
+++ b/cpp/src/lists/count_elements.cu
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2021, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+#include
+
+#include
+#include
+
+namespace cudf {
+namespace lists {
+namespace detail {
+/**
+ * @brief Returns a column holding the number of elements in each row of a lists column.
+ *
+ * @param input Input lists column; a null row yields a null output row (the value written is 0).
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @param mr Device memory resource used to allocate the returned column's device memory.
+ * @return New INT32 column with one length per input row and a copy of the input's null mask.
+ */
+std::unique_ptr count_elements(lists_column_view const& input,
+                               rmm::cuda_stream_view stream,
+                               rmm::mr::device_memory_resource* mr)
+{
+  auto device_column = cudf::column_device_view::create(input.parent(), stream);
+  auto d_column      = *device_column;
+  // create output column; NOTE(review): the mask copy below takes no stream -- confirm ordering
+  auto output = make_fixed_width_column(data_type{type_to_id()},
+                                        input.size(),
+                                        copy_bitmask(input.parent(), mr),  // mask must come from `mr` too
+                                        input.null_count(),
+                                        stream,
+                                        mr);
+
+  // fill in the sizes: row i holds offsets[i + 1] - offsets[i]
+  thrust::transform(rmm::exec_policy(stream),
+                    thrust::make_counting_iterator(0),
+                    thrust::make_counting_iterator(input.size()),
+                    output->mutable_view().begin(),
+                    [d_column] __device__(size_type idx) {
+                      if (d_column.is_null(idx)) return size_type{0};  // placeholder under a null
+                      auto d_offsets =
+                        d_column.child(lists_column_view::offsets_column_index).data() +
+                        d_column.offset();  // shift by the view's offset so sliced input works
+                      return d_offsets[idx + 1] - d_offsets[idx];
+                    });
+
+  output->set_null_count(input.null_count());  // re-set; presumably cleared by mutable_view() -- confirm
+  return output;
+}
+
+}  // namespace detail
+
+// external APIs: forward to the detail implementation on the default CUDA stream
+
+std::unique_ptr count_elements(lists_column_view const& input,
+                               rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::count_elements(input, rmm::cuda_stream_default, mr);
+}
+
+}  // namespace lists
+}  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 0d958f47b6b..106638a1a6f
100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -653,6 +653,7 @@ ConfigureTest(AST_TEST "${AST_TEST_SRC}") # - lists tests ---------------------------------------------------------------------------------- set(LISTS_TEST_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/lists/count_elements_tests.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/lists/extract_tests.cpp") ConfigureTest(LISTS_TEST "${LISTS_TEST_SRC}") diff --git a/cpp/tests/lists/count_elements_tests.cpp b/cpp/tests/lists/count_elements_tests.cpp new file mode 100644 index 00000000000..c5cb9d230c3 --- /dev/null +++ b/cpp/tests/lists/count_elements_tests.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+struct ListsElementsTest : public cudf::test::BaseFixture {  // fixture shared by all cases below
+};
+
+using NumericTypesNotBool =  // element types the typed tests are instantiated for
+  cudf::test::Concat;
+
+template
+class ListsElementsNumericsTest : public ListsElementsTest {
+};
+
+TYPED_TEST_CASE(ListsElementsNumericsTest, NumericTypesNotBool);
+
+TYPED_TEST(ListsElementsNumericsTest, CountElements)  // flat numeric lists, row 1 is null
+{
+  auto validity = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(0), [](auto i) { return i != 1; });  // all valid except row 1
+  using LCW = cudf::test::lists_column_wrapper;
+  LCW input({LCW{3, 2, 1}, LCW{}, LCW{30, 20, 10, 50}, LCW{100, 120}, LCW{0}}, validity);
+
+  auto result = cudf::lists::count_elements(cudf::lists_column_view(input));
+  cudf::test::fixed_width_column_wrapper expected({3, 0, 4, 2, 1}, {1, 0, 1, 1, 1});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+TEST_F(ListsElementsTest, CountElementsStrings)  // lists of strings, incl. empty and non-ASCII
+{
+  auto validity = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(0), [](auto i) { return i != 1; });  // all valid except row 1
+  using LCW = cudf::test::lists_column_wrapper;
+  LCW input(
+    {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", "", "z"}, LCW{"tést", "String"}, LCW{""}},
+    validity);
+
+  auto result = cudf::lists::count_elements(cudf::lists_column_view(input));
+  cudf::test::fixed_width_column_wrapper expected({3, 0, 4, 2, 1}, {1, 0, 1, 1, 1});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+TEST_F(ListsElementsTest, CountElementsSliced)  // counts must follow a sliced view, not the parent
+{
+  auto validity = thrust::make_transform_iterator(
+    thrust::make_counting_iterator(0), [](auto i) { return i != 1; });  // all valid except row 1
+  using LCW = cudf::test::lists_column_wrapper;
+  LCW input(
+    {LCW{"", "Héllo", "thesé"}, LCW{}, LCW{"are", "some", "", "z"}, LCW{"tést", "String"}, LCW{""}},
+    validity);
+
+  auto sliced = cudf::slice(input, {1, 4}).front();  // rows [1, 4) of the input
+  auto result = cudf::lists::count_elements(cudf::lists_column_view(sliced));
+  cudf::test::fixed_width_column_wrapper expected({0, 4, 2}, {0, 1, 1});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+TYPED_TEST(ListsElementsNumericsTest, CountElementsNestedLists)  // only the outer level is counted
+{
+  std::vector validity{1, 0, 1, 1};  // row 1 is null
+  using LCW = cudf::test::lists_column_wrapper;
+  LCW list({LCW{LCW{2, 3}, LCW{4, 5}},
+            LCW{LCW{}},
+            LCW{LCW{6, 7, 8}, LCW{9, 10, 11}, LCW({12, 13, 14}, validity.begin())},
+            LCW{LCW{15, 16}, LCW{17, 18}, LCW{19, 20}, LCW{21, 22}, LCW{23, 24}}},
+           validity.begin());
+
+  auto result = cudf::lists::count_elements(cudf::lists_column_view(list));
+  cudf::test::fixed_width_column_wrapper expected({2, 1, 3, 5}, {1, 0, 1, 1});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *result);
+}
+
+TEST_F(ListsElementsTest, CountElementsEmpty)  // zero-row input gives zero-row output
+{
+  using LCW = cudf::test::lists_column_wrapper;
+
+  LCW empty{};
+  auto result = cudf::lists::count_elements(cudf::lists_column_view(empty));
+  EXPECT_EQ(0, result->size());
+}