Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Implement lists::distinct and cudf::detail::stable_distinct #11149

Merged
merged 29 commits into from
Jul 11, 2022
Merged
Show file tree
Hide file tree
Changes from 26 commits
Commits
Show all changes
29 commits
Select commit Hold shift + click to select a range
1d7e8e0
Add new implementation and test files
ttnghia Jun 24, 2022
51b80db
Fix compile error
ttnghia Jun 24, 2022
08a76ad
Rename function
ttnghia Jun 27, 2022
16101f7
Implement `cudf::detail::stable_distinct` and `lists::distinct`
ttnghia Jun 27, 2022
5ec13d6
Rewrite doxygen
ttnghia Jun 27, 2022
6c5b738
Rename variable
ttnghia Jun 27, 2022
5b70eee
Rewrite comment
ttnghia Jun 27, 2022
238248d
Rename files
ttnghia Jun 27, 2022
ba6bf6b
Implement float tests
ttnghia Jun 27, 2022
3845c95
Implement string tests
ttnghia Jun 27, 2022
507c82d
Implement tests for `ListDistinctTypedTest`
ttnghia Jun 28, 2022
2cb8347
Complete the remaining tests
ttnghia Jun 28, 2022
7efdea0
Merge branch 'branch-22.08' into add_lists_distinct
ttnghia Jun 28, 2022
4388637
Rewrite doxygen
ttnghia Jun 28, 2022
4dd5e74
Misc
ttnghia Jun 28, 2022
3b0760c
Misc
ttnghia Jun 28, 2022
9730b70
Rewrite test
ttnghia Jun 28, 2022
9bd9b6f
Fix doxygen
ttnghia Jun 28, 2022
790a482
Fix header
ttnghia Jun 28, 2022
1c58baa
Rewrite doxygen
ttnghia Jun 28, 2022
d493c4f
Rewrite doxygen and fix headers
ttnghia Jun 28, 2022
d090d2a
Fix iterator type
ttnghia Jun 30, 2022
ee51822
Rewrite doxygen
ttnghia Jun 30, 2022
ccdd6f0
Add empty lines
ttnghia Jun 30, 2022
034ee2a
Merge branch 'branch-22.08' into add_lists_distinct
ttnghia Jun 30, 2022
b1231a2
Update default stream
ttnghia Jun 30, 2022
af91b80
Merge branch 'branch-22.08' into add_lists_distinct
ttnghia Jul 5, 2022
86c9ba8
Merge branch 'branch-22.08' into add_lists_distinct
ttnghia Jul 8, 2022
99d70b1
Handle empty input
ttnghia Jul 8, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,6 @@ add_library(
src/join/mixed_join_size_kernel_nulls.cu
src/join/mixed_join_size_kernels_semi.cu
src/join/semi_join.cu
src/lists/apply_boolean_mask.cu
src/lists/contains.cu
src/lists/combine/concatenate_list_elements.cu
src/lists/combine/concatenate_rows.cu
Expand All @@ -387,6 +386,9 @@ add_library(
src/lists/lists_column_view.cu
src/lists/segmented_sort.cu
src/lists/sequences.cu
src/lists/stream_compaction/apply_boolean_mask.cu
src/lists/stream_compaction/distinct.cu
src/lists/utilities.cu
src/merge/merge.cu
src/partitioning/partitioning.cu
src/partitioning/round_robin.cu
Expand Down Expand Up @@ -450,6 +452,7 @@ add_library(
src/stream_compaction/distinct_reduce.cu
src/stream_compaction/drop_nans.cu
src/stream_compaction/drop_nulls.cu
src/stream_compaction/stable_distinct.cu
src/stream_compaction/unique.cu
src/stream_compaction/unique_count.cu
src/strings/attributes.cu
Expand Down
30 changes: 30 additions & 0 deletions cpp/include/cudf/detail/stream_compaction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/utilities/default_stream.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
PointKernel marked this conversation as resolved.
Show resolved Hide resolved
PointKernel marked this conversation as resolved.
Show resolved Hide resolved

namespace cudf {
namespace detail {
Expand Down Expand Up @@ -89,6 +90,35 @@ std::unique_ptr<table> distinct(
rmm::cuda_stream_view stream = cudf::default_stream_value,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a new table without duplicate rows, preserving the input row order.
*
* Given an `input` table_view, each row is copied to the output table to create a set of distinct
* rows. The row order is guaranteed to be preserved as in the input.
*
* If there are duplicate rows, which row is copied depends on the specified value of the `keep`
* parameter.
*
* This API produces exactly the same set of output rows as `cudf::distinct`.
*
* If `input` has no rows or no columns, or if `keys` is empty, the result of `empty_like(input)`
* is returned.
*
* @param input The input table
* @param keys Vector of indices indicating key columns in the `input` table
* @param keep Copy any, first, last, or none of the found duplicates
* @param nulls_equal Flag to specify whether null elements should be considered as equal
* @param nans_equal Flag to specify whether NaN elements should be considered as equal
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table
* @return A table containing the resulting distinct rows
*/
std::unique_ptr<table> stable_distinct(
table_view const& input,
std::vector<size_type> const& keys,
duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY,
null_equality nulls_equal = null_equality::EQUAL,
nan_equality nans_equal = nan_equality::ALL_EQUAL,
rmm::cuda_stream_view stream = cudf::default_stream_value,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a column of indices of all distinct rows in the input table.
*
Expand Down
12 changes: 12 additions & 0 deletions cpp/include/cudf/lists/detail/stream_compaction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,16 @@ std::unique_ptr<column> apply_boolean_mask(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::lists::distinct
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> distinct(
lists_column_view const& input,
null_equality nulls_equal,
nan_equality nans_equal,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace cudf::lists::detail
25 changes: 25 additions & 0 deletions cpp/include/cudf/lists/stream_compaction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,29 @@ std::unique_ptr<column> apply_boolean_mask(
lists_column_view const& boolean_mask,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a new list column without duplicate elements in each list.
*
* Given a lists column `input`, distinct elements of each list are copied to the corresponding
* output list. The order of lists is preserved while the order of elements within each list is not
* guaranteed.
*
* The output has the same number of rows as `input`, and null lists in `input` remain null in the
* output.
*
* Example:
* @code{.pseudo}
* input = { {0, 1, 2, 3, 2}, {3, 1, 2}, null, {4, null, null, 5} }
* result = { {0, 1, 2, 3}, {3, 1, 2}, null, {4, null, 5} }
* @endcode
*
* @param input The input lists column
* @param nulls_equal Flag to specify whether null elements should be considered as equal
* @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal
* @param mr Device memory resource used to allocate the returned object
* @return The resulting lists column containing lists without duplicates
*/
std::unique_ptr<column> distinct(
lists_column_view const& input,
null_equality nulls_equal = null_equality::EQUAL,
nan_equality nans_equal = nan_equality::ALL_EQUAL,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace cudf::lists
81 changes: 81 additions & 0 deletions cpp/src/lists/stream_compaction/distinct.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <lists/utilities.hpp>

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/stream_compaction.hpp>
#include <cudf/lists/lists_column_view.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>

#include <rmm/cuda_stream_view.hpp>

ttnghia marked this conversation as resolved.
Show resolved Hide resolved
#include <memory>
#include <utility>

namespace cudf::lists {
namespace detail {

// Removes duplicate elements within each list of `input`.
//
// Every child element is paired with the label of its list, and the pair {label, element} is
// used as the key for `stable_distinct`. Two elements therefore only count as duplicates when
// they carry the same list label. The surviving labels are then turned back into list offsets.
std::unique_ptr<column> distinct(lists_column_view const& input,
                                 null_equality nulls_equal,
                                 nan_equality nans_equal,
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr)
{
  // Step 1: label each element of the sliced child with its list.
  auto const sliced_child = input.get_sliced_child(stream);
  auto const list_labels  = generate_labels(input, sliced_child.size(), stream);

  // Step 2: keep one row per distinct {label, element} pair; both columns are keys.
  auto const labeled_elements = table_view{{list_labels->view(), sliced_child}};
  auto const key_indices      = std::vector<size_type>{0, 1};
  auto const deduplicated     = cudf::detail::stable_distinct(labeled_elements,
                                                          key_indices,
                                                          duplicate_keep_option::KEEP_ANY,
                                                          nulls_equal,
                                                          nans_equal,
                                                          stream,
                                                          mr);

  // Step 3: rebuild the offsets from the surviving labels (column 0) while the table is intact,
  // then take ownership of the surviving elements (the last column).
  auto new_offsets =
    reconstruct_offsets(deduplicated->get_column(0).view(), input.size(), stream, mr);
  auto new_child = std::move(deduplicated->release().back());

  // The list-level null mask and null count are carried over from the input.
  auto new_null_mask = cudf::detail::copy_bitmask(input.parent(), stream, mr);

  return make_lists_column(input.size(),
                           std::move(new_offsets),
                           std::move(new_child),
                           input.null_count(),
                           std::move(new_null_mask),
                           stream,
                           mr);
}

} // namespace detail

// Public entry point: forwards to the detail implementation on the default stream.
std::unique_ptr<column> distinct(lists_column_view const& input,
                                 null_equality nulls_equal,
                                 nan_equality nans_equal,
                                 rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto const stream = cudf::default_stream_value;
  return detail::distinct(input, nulls_equal, nans_equal, stream, mr);
}

} // namespace cudf::lists
55 changes: 55 additions & 0 deletions cpp/src/lists/utilities.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include "utilities.hpp"

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/labeling/label_segments.cuh>

namespace cudf::lists::detail {

// Builds a `size_type` column of `n_elements` list labels, one per child element, by running
// `label_segments` over the offsets of `input`.
// No memory resource is passed to the column factory, so the factory's default is used.
std::unique_ptr<column> generate_labels(lists_column_view const& input,
                                        size_type n_elements,
                                        rmm::cuda_stream_view stream)
{
  auto const label_type = data_type(type_to_id<size_type>());
  auto out_labels =
    make_numeric_column(label_type, n_elements, cudf::mask_state::UNALLOCATED, stream);

  auto const out_begin = out_labels->mutable_view().begin<size_type>();
  auto const out_end   = out_begin + n_elements;
  cudf::detail::label_segments(
    input.offsets_begin(), input.offsets_end(), out_begin, out_end, stream);

  return out_labels;
}

// Builds an offsets column with `n_lists + 1` entries from the given list labels by running
// `labels_to_offsets` over the whole `labels` column.
std::unique_ptr<column> reconstruct_offsets(column_view const& labels,
                                            size_type n_lists,
                                            rmm::cuda_stream_view stream,
                                            rmm::mr::device_memory_resource* mr)
{
  auto const offsets_type = data_type{type_to_id<offset_type>()};
  auto result =
    make_numeric_column(offsets_type, n_lists + 1, mask_state::UNALLOCATED, stream, mr);

  auto const in_begin  = labels.begin<size_type>();
  auto const in_end    = in_begin + labels.size();
  auto const out_begin = result->mutable_view().begin<offset_type>();
  auto const out_end   = out_begin + result->size();
  cudf::detail::labels_to_offsets(in_begin, in_end, out_begin, out_end, stream);

  return result;
}

} // namespace cudf::lists::detail
53 changes: 53 additions & 0 deletions cpp/src/lists/utilities.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,53 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#pragma once

#include <cudf/column/column_view.hpp>
#include <cudf/lists/lists_column_view.hpp>

#include <rmm/cuda_stream_view.hpp>
ttnghia marked this conversation as resolved.
Show resolved Hide resolved
#include <rmm/mr/device/device_memory_resource.hpp>

namespace cudf::lists::detail {

/**
* @brief Generate list labels for elements in the child column of the input lists column.
*
* @param input The input lists column
* @param n_elements The number of elements in the child column of the input lists column
* @param stream CUDA stream used for device memory operations and kernel launches
* @return A column containing list labels corresponding to each element in the child column
*/
std::unique_ptr<column> generate_labels(lists_column_view const& input,
size_type n_elements,
rmm::cuda_stream_view stream);

/**
* @brief Reconstruct an offsets column from the input list labels column.
*
* @param labels The list labels corresponding to each list element
* @param n_lists The number of lists to build the offsets column
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned object
* @return The output offsets column
*/
std::unique_ptr<column> reconstruct_offsets(column_view const& labels,
size_type n_lists,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr);

} // namespace cudf::lists::detail
66 changes: 66 additions & 0 deletions cpp/src/stream_compaction/stable_distinct.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,66 @@
/*
* Copyright (c) 2022, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/detail/copy_if.cuh>
#include <cudf/detail/stream_compaction.hpp>
#include <cudf/table/table.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf/types.hpp>

#include <thrust/iterator/constant_iterator.h>
#include <thrust/scatter.h>
#include <thrust/uninitialized_fill.h>

namespace cudf::detail {

// Returns the distinct rows of `input` (compared on the `keys` columns) in input order.
//
// The indices of the rows to keep come from `get_distinct_indices`. They are scattered into a
// per-row boolean flag array, and `copy_if` then copies the flagged rows.
std::unique_ptr<table> stable_distinct(table_view const& input,
                                       std::vector<size_type> const& keys,
                                       duplicate_keep_option keep,
                                       null_equality nulls_equal,
                                       nan_equality nans_equal,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
{
  // Without rows, columns, or keys there is nothing to compare.
  bool const has_work =
    input.num_rows() != 0 and input.num_columns() != 0 and not keys.empty();
  if (not has_work) { return empty_like(input); }

  auto const kept_indices =
    get_distinct_indices(input.select(keys), keep, nulls_equal, nans_equal, stream);
  auto const n_kept = static_cast<size_type>(kept_indices.size());

  // One flag per input row: start all false, then set true at every kept index.
  rmm::device_uvector<bool> keep_flags(input.num_rows(), stream);
  thrust::uninitialized_fill(
    rmm::exec_policy(stream), keep_flags.begin(), keep_flags.end(), false);

  thrust::constant_iterator<bool> const true_begin(true, 0);
  thrust::constant_iterator<bool> const true_end(true, n_kept);
  thrust::scatter(
    rmm::exec_policy(stream), true_begin, true_end, kept_indices.begin(), keep_flags.begin());

  // Copy only the flagged rows.
  return cudf::detail::copy_if(
    input,
    [flags = keep_flags.begin()] __device__(auto const row_idx) { return flags[row_idx]; },
    stream,
    mr);
}

} // namespace cudf::detail
3 changes: 2 additions & 1 deletion cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -471,7 +471,6 @@ ConfigureTest(AST_TEST ast/transform_tests.cpp)
# * lists tests ----------------------------------------------------------------------------------
ConfigureTest(
LISTS_TEST
lists/apply_boolean_mask_test.cpp
lists/combine/concatenate_list_elements_tests.cpp
lists/combine/concatenate_rows_tests.cpp
lists/contains_tests.cpp
Expand All @@ -481,6 +480,8 @@ ConfigureTest(
lists/extract_tests.cpp
lists/sequences_tests.cpp
lists/sort_lists_tests.cpp
lists/stream_compaction/apply_boolean_mask_tests.cpp
lists/stream_compaction/distinct_tests.cpp
)

# ##################################################################################################
Expand Down
Loading