Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Fully support nested types in lists::drop_list_duplicates #11224

Closed
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
31 commits
Select commit Hold shift + click to select a range
1d7e8e0
Add new implementation and test files
ttnghia Jun 24, 2022
51b80db
Fix compile error
ttnghia Jun 24, 2022
08a76ad
Rename function
ttnghia Jun 27, 2022
16101f7
Implement `cudf::detail::stable_distinct` and `lists::distinct`
ttnghia Jun 27, 2022
5ec13d6
Rewrite doxygen
ttnghia Jun 27, 2022
6c5b738
Rename variable
ttnghia Jun 27, 2022
5b70eee
Rewrite comment
ttnghia Jun 27, 2022
238248d
Rename files
ttnghia Jun 27, 2022
ba6bf6b
Implement float tests
ttnghia Jun 27, 2022
3845c95
Implement string tests
ttnghia Jun 27, 2022
507c82d
Implement tests for `ListDistinctTypedTest`
ttnghia Jun 28, 2022
2cb8347
Complete the remaining tests
ttnghia Jun 28, 2022
7efdea0
Merge branch 'branch-22.08' into add_lists_distinct
ttnghia Jun 28, 2022
4388637
Rewrite doxygen
ttnghia Jun 28, 2022
4dd5e74
Misc
ttnghia Jun 28, 2022
3b0760c
Misc
ttnghia Jun 28, 2022
9730b70
Rewrite test
ttnghia Jun 28, 2022
9bd9b6f
Fix doxygen
ttnghia Jun 28, 2022
790a482
Fix header
ttnghia Jun 28, 2022
1c58baa
Rewrite doxygen
ttnghia Jun 28, 2022
d493c4f
Rewrite doxygen and fix headers
ttnghia Jun 28, 2022
d090d2a
Fix iterator type
ttnghia Jun 30, 2022
ee51822
Rewrite doxygen
ttnghia Jun 30, 2022
ccdd6f0
Add empty lines
ttnghia Jun 30, 2022
034ee2a
Merge branch 'branch-22.08' into add_lists_distinct
ttnghia Jun 30, 2022
b1231a2
Update default stream
ttnghia Jun 30, 2022
af91b80
Merge branch 'branch-22.08' into add_lists_distinct
ttnghia Jul 5, 2022
86c9ba8
Merge branch 'branch-22.08' into add_lists_distinct
ttnghia Jul 8, 2022
99d70b1
Handle empty input
ttnghia Jul 8, 2022
c08b8d8
Merge branch 'add_lists_distinct' into reimplement_drop_list_duplicates
ttnghia Jul 8, 2022
5110657
Reimplement `drop_list_duplicates`
ttnghia Jul 8, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
5 changes: 4 additions & 1 deletion cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -369,7 +369,6 @@ add_library(
src/join/mixed_join_size_kernel_nulls.cu
src/join/mixed_join_size_kernels_semi.cu
src/join/semi_join.cu
src/lists/apply_boolean_mask.cu
src/lists/contains.cu
src/lists/combine/concatenate_list_elements.cu
src/lists/combine/concatenate_rows.cu
Expand All @@ -387,6 +386,9 @@ add_library(
src/lists/lists_column_view.cu
src/lists/segmented_sort.cu
src/lists/sequences.cu
src/lists/stream_compaction/apply_boolean_mask.cu
src/lists/stream_compaction/distinct.cu
src/lists/utilities.cu
src/merge/merge.cu
src/partitioning/partitioning.cu
src/partitioning/round_robin.cu
Expand Down Expand Up @@ -452,6 +454,7 @@ add_library(
src/stream_compaction/distinct_reduce.cu
src/stream_compaction/drop_nans.cu
src/stream_compaction/drop_nulls.cu
src/stream_compaction/stable_distinct.cu
src/stream_compaction/unique.cu
src/stream_compaction/unique_count.cu
src/strings/attributes.cu
Expand Down
30 changes: 30 additions & 0 deletions cpp/include/cudf/detail/stream_compaction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
#include <cudf/utilities/default_stream.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>

namespace cudf {
namespace detail {
Expand Down Expand Up @@ -89,6 +90,35 @@ std::unique_ptr<table> distinct(
rmm::cuda_stream_view stream = cudf::default_stream_value,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a new table without duplicate rows, preserving the input row order.
*
* Given an `input` table_view, each row is copied to the output table to create a set of distinct
* rows. The row order is guaranteed to be preserved as in the input.
*
* If there are duplicate rows, which of them is copied depends on the specified value of the
* `keep` parameter.
*
* This API produces exactly the same set of output rows as `cudf::distinct`.
*
* @param input The input table
* @param keys Vector of indices indicating key columns in the `input` table
* @param keep Copy any, first, last, or none of the found duplicates (defaults to
*        `duplicate_keep_option::KEEP_ANY`)
* @param nulls_equal Flag to specify whether null elements should be considered as equal
*        (defaults to `null_equality::EQUAL`)
* @param nans_equal Flag to specify whether NaN elements should be considered as equal
*        (defaults to `nan_equality::ALL_EQUAL`)
* @param stream CUDA stream used for device memory operations and kernel launches
* @param mr Device memory resource used to allocate the returned table
* @return A table containing the resulting distinct rows
*/
std::unique_ptr<table> stable_distinct(
table_view const& input,
std::vector<size_type> const& keys,
duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY,
null_equality nulls_equal = null_equality::EQUAL,
nan_equality nans_equal = nan_equality::ALL_EQUAL,
rmm::cuda_stream_view stream = cudf::default_stream_value,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a column of indices of all distinct rows in the input table.
*
Expand Down
12 changes: 12 additions & 0 deletions cpp/include/cudf/lists/detail/stream_compaction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -34,4 +34,16 @@ std::unique_ptr<column> apply_boolean_mask(
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @copydoc cudf::lists::distinct
*
* @param stream CUDA stream used for device memory operations and kernel launches.
*/
std::unique_ptr<column> distinct(
lists_column_view const& input,
null_equality nulls_equal,
nan_equality nans_equal,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace cudf::lists::detail
25 changes: 25 additions & 0 deletions cpp/include/cudf/lists/stream_compaction.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -55,4 +55,29 @@ std::unique_ptr<column> apply_boolean_mask(
lists_column_view const& boolean_mask,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Create a new list column without duplicate elements in each list.
*
* Given a lists column `input`, distinct elements of each list are copied to the corresponding
* output list. The order of lists is preserved while the order of elements within each list is not
* guaranteed.
*
* Example:
* @code{.pseudo}
* input  = { {0, 1, 2, 3, 2}, {3, 1, 2}, null, {4, null, null, 5} }
* result = { {0, 1, 2, 3}, {3, 1, 2}, null, {4, null, 5} }
* @endcode
*
* In the example, the null list stays null, and the two null elements of the last list collapse
* into one, which is consistent with the default `nulls_equal = null_equality::EQUAL`. The element
* order shown within each result list is illustrative only; as stated above, it is not guaranteed.
*
* @param input The input lists column
* @param nulls_equal Flag to specify whether null elements should be considered as equal
* @param nans_equal Flag to specify whether floating-point NaNs should be considered as equal
* @param mr Device memory resource used to allocate the returned object
* @return The resulting lists column containing lists without duplicates
*/
std::unique_ptr<column> distinct(
lists_column_view const& input,
null_equality nulls_equal = null_equality::EQUAL,
nan_equality nans_equal = nan_equality::ALL_EQUAL,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

} // namespace cudf::lists
Loading