Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add libcudf lists column count_elements API #7173

Merged
merged 7 commits into from
Jan 25, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@ test:
- test -f $PREFIX/include/cudf/join.hpp
- test -f $PREFIX/include/cudf/lists/detail/concatenate.hpp
- test -f $PREFIX/include/cudf/lists/detail/copying.hpp
- test -f $PREFIX/include/cudf/lists/count_elements.hpp
- test -f $PREFIX/include/cudf/lists/extract.hpp
- test -f $PREFIX/include/cudf/lists/gather.hpp
- test -f $PREFIX/include/cudf/lists/lists_column_view.hpp
Expand Down
56 changes: 56 additions & 0 deletions cpp/include/cudf/lists/count_elements.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,56 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/column/column.hpp>
#include <cudf/lists/lists_column_view.hpp>

namespace cudf {
namespace lists {
/**
* @addtogroup lists_elements
* @{
* @file
*/

/**
* @brief Returns a numeric column containing the number of rows in
* each list element in the given lists column.
*
* The output column will have the same number of rows as the
* input lists column. Each `output[i]` will be `input[i].size()`.
* For nested lists, only the immediate children of each row are
* counted; the elements of inner lists are not included.
*
* @code{.pseudo}
* l = { {1, 2, 3}, {4}, {5, 6} }
* r = count_elements(l)
* r is now {3, 1, 2}
*
* n = { {{2, 3}, {4, 5}}, {{6, 7, 8}} }
* r = count_elements(n)
* r is now {2, 1}
* @endcode
*
* Any null input element will result in a corresponding null entry
* in the output column.
*
* @param input Input lists column.
* @param mr Device memory resource used to allocate the returned column's device memory.
* @return New INT32 column with the number of elements for each row.
*/
std::unique_ptr<column> count_elements(
lists_column_view const& input,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of lists_elements group

} // namespace lists
} // namespace cudf
3 changes: 2 additions & 1 deletion cpp/include/doxygen_groups.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020, NVIDIA CORPORATION.
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -143,6 +143,7 @@
* @defgroup lists_apis Lists
* @{
* @defgroup lists_extract Extracting
* @defgroup lists_elements Counting
* @}
* @defgroup nvtext_apis NVText
* @{
Expand Down
87 changes: 87 additions & 0 deletions cpp/src/lists/count_elements.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,87 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/lists/count_elements.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_vector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/transform.h>
#include <thrust/transform_scan.h>

namespace cudf {
namespace lists {
namespace detail {
/**
 * @brief Returns a numeric column containing lengths of each element.
 *
 * Each output row is the difference between two consecutive entries of the
 * input's offsets child, so for nested lists only the immediate children of a
 * row are counted. Null input rows stay null in the output (the null mask is
 * copied from the input) and their data slot is written as 0.
 *
 * @param input Input lists column.
 * @param stream CUDA stream used for device memory operations and kernel launches.
 * @param mr Device memory resource used to allocate the returned column's device memory
 * @return New INT32 column with lengths.
 */
std::unique_ptr<column> count_elements(lists_column_view const& input,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
{
  auto device_column = cudf::column_device_view::create(input.parent(), stream);
  auto d_column      = *device_column;
  // Create the output column. The null mask is copied through the detail API
  // so that it honours the caller's `stream` and `mr`; the unqualified
  // copy_bitmask(view) call resolves to the public overload, which receives
  // neither of them.
  auto output = make_fixed_width_column(data_type{type_to_id<size_type>()},
                                        input.size(),
                                        cudf::detail::copy_bitmask(input.parent(), stream, mr),
                                        input.null_count(),
                                        stream,
                                        mr);

  // fill in the sizes
  thrust::transform(rmm::exec_policy(stream),
                    thrust::make_counting_iterator<cudf::size_type>(0),
                    thrust::make_counting_iterator<cudf::size_type>(input.size()),
                    output->mutable_view().begin<size_type>(),
                    [d_column] __device__(size_type idx) {
                      // null rows get a placeholder value; the copied mask marks them null
                      if (d_column.is_null(idx)) return size_type{0};
                      // shift by the parent's offset so a sliced input reads its own rows
                      auto d_offsets =
                        d_column.child(lists_column_view::offsets_column_index).data<size_type>() +
                        d_column.offset();
                      return d_offsets[idx + 1] - d_offsets[idx];
                    });

  // NOTE(review): presumably needed because mutable_view() above invalidates the
  // cached null count -- confirm against cudf::column::mutable_view.
  output->set_null_count(input.null_count());  // reset null count
  return output;
}

} // namespace detail

// external APIs

// Public entry point: forwards to the detail implementation on the default stream.
std::unique_ptr<column> count_elements(lists_column_view const& input,
                                       rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto const stream = rmm::cuda_stream_default;
  return detail::count_elements(input, stream, mr);
}

} // namespace lists
} // namespace cudf
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -653,6 +653,7 @@ ConfigureTest(AST_TEST "${AST_TEST_SRC}")
# - lists tests ----------------------------------------------------------------------------------

set(LISTS_TEST_SRC
"${CMAKE_CURRENT_SOURCE_DIR}/lists/count_elements_tests.cpp"
"${CMAKE_CURRENT_SOURCE_DIR}/lists/extract_tests.cpp")

ConfigureTest(LISTS_TEST "${LISTS_TEST_SRC}")
Expand Down
100 changes: 100 additions & 0 deletions cpp/tests/lists/count_elements_tests.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,100 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/copying.hpp>
#include <cudf/lists/count_elements.hpp>

#include <cudf_test/base_fixture.hpp>
#include <cudf_test/column_utilities.hpp>
#include <cudf_test/column_wrapper.hpp>
#include <cudf_test/type_lists.hpp>

// Base fixture shared by every count_elements test in this file.
struct ListsElementsTest : cudf::test::BaseFixture {
};

// Element types exercised by the typed tests: integral (excluding bool) and floating point.
using NumericTypesNotBool =
  cudf::test::Concat<cudf::test::IntegralTypesNotBool, cudf::test::FloatingPointTypes>;

// Typed fixture; T is the element type stored in the lists under test.
template <typename T>
struct ListsElementsNumericsTest : ListsElementsTest {
};

TYPED_TEST_CASE(ListsElementsNumericsTest, NumericTypesNotBool);

// Numeric lists: row 1 is null and must stay null; every other row reports its length.
TYPED_TEST(ListsElementsNumericsTest, CountElements)
{
  using lists_col = cudf::test::lists_column_wrapper<TypeParam>;

  // every row is valid except row 1
  auto row_is_valid = [](auto row) { return row != 1; };
  auto valids       = thrust::make_transform_iterator(
    thrust::make_counting_iterator<cudf::size_type>(0), row_is_valid);

  lists_col lists(
    {lists_col{3, 2, 1}, lists_col{}, lists_col{30, 20, 10, 50}, lists_col{100, 120}, lists_col{0}},
    valids);

  auto counts = cudf::lists::count_elements(cudf::lists_column_view(lists));

  cudf::test::fixed_width_column_wrapper<int32_t> expected({3, 0, 4, 2, 1}, {1, 0, 1, 1, 1});
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *counts);
}

// Lists of strings: counts are per-row string counts, independent of string length.
TEST_F(ListsElementsTest, CountElementsStrings)
{
  using strings_lists = cudf::test::lists_column_wrapper<cudf::string_view>;

  // every row is valid except row 1
  auto row_is_valid = [](auto row) { return row != 1; };
  auto valids       = thrust::make_transform_iterator(
    thrust::make_counting_iterator<cudf::size_type>(0), row_is_valid);

  strings_lists lists({strings_lists{"", "Héllo", "thesé"},
                       strings_lists{},
                       strings_lists{"are", "some", "", "z"},
                       strings_lists{"tést", "String"},
                       strings_lists{""}},
                      valids);

  auto counts = cudf::lists::count_elements(cudf::lists_column_view(lists));

  cudf::test::fixed_width_column_wrapper<int32_t> expected({3, 0, 4, 2, 1}, {1, 0, 1, 1, 1});
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *counts);
}

// Sliced input: rows [1, 4) of the strings column, so the null row becomes row 0.
TEST_F(ListsElementsTest, CountElementsSliced)
{
  using strings_lists = cudf::test::lists_column_wrapper<cudf::string_view>;

  // every row of the unsliced column is valid except row 1
  auto row_is_valid = [](auto row) { return row != 1; };
  auto valids       = thrust::make_transform_iterator(
    thrust::make_counting_iterator<cudf::size_type>(0), row_is_valid);

  strings_lists lists({strings_lists{"", "Héllo", "thesé"},
                       strings_lists{},
                       strings_lists{"are", "some", "", "z"},
                       strings_lists{"tést", "String"},
                       strings_lists{""}},
                      valids);

  auto pieces = cudf::slice(lists, {1, 4});
  auto window = pieces.front();
  auto counts = cudf::lists::count_elements(cudf::lists_column_view(window));

  cudf::test::fixed_width_column_wrapper<int32_t> expected({0, 4, 2}, {0, 1, 1});
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *counts);
}

// Nested lists: only the immediate child lists of each row are counted.
TYPED_TEST(ListsElementsNumericsTest, CountElementsNestedLists)
{
  using lists_col = cudf::test::lists_column_wrapper<TypeParam>;

  // Reused twice: as the outer row validity (row 1 null) and as the element
  // validity of the inner list {12, 13, 14}.
  std::vector<int32_t> valids{1, 0, 1, 1};

  lists_col nested(
    {lists_col{lists_col{2, 3}, lists_col{4, 5}},
     lists_col{lists_col{}},
     lists_col{lists_col{6, 7, 8}, lists_col{9, 10, 11}, lists_col({12, 13, 14}, valids.begin())},
     lists_col{lists_col{15, 16},
               lists_col{17, 18},
               lists_col{19, 20},
               lists_col{21, 22},
               lists_col{23, 24}}},
    valids.begin());

  auto counts = cudf::lists::count_elements(cudf::lists_column_view(nested));

  cudf::test::fixed_width_column_wrapper<int32_t> expected({2, 1, 3, 5}, {1, 0, 1, 1});
  CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *counts);
}

// An empty lists column yields an empty result column.
TEST_F(ListsElementsTest, CountElementsEmpty)
{
  using strings_lists = cudf::test::lists_column_wrapper<cudf::string_view>;

  strings_lists no_rows{};
  auto counts = cudf::lists::count_elements(cudf::lists_column_view(no_rows));
  EXPECT_EQ(0, counts->size());
}