Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Generate group offsets from element labels #11017

Merged
merged 67 commits into from
Jun 3, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
67 commits
Select commit Hold shift + click to select a range
41a7baa
Switch to use `generate_list_labels`
ttnghia May 23, 2022
1204e9a
Remove comments
ttnghia May 23, 2022
165f752
Switch to use 0-based list labels
ttnghia May 23, 2022
3f385eb
Implement `fill_segmented_labels`
ttnghia May 24, 2022
2684625
Move file and change file name
ttnghia May 24, 2022
0cfe856
Use `fill_segmented_labels` in groupby
ttnghia May 24, 2022
1007176
Add comment
ttnghia May 24, 2022
9b5a88d
Add example
ttnghia May 24, 2022
9903007
Rename and move file
ttnghia May 24, 2022
7bd714d
Rename variable
ttnghia May 24, 2022
031302b
Add a benchmark
ttnghia May 25, 2022
28e1463
Rewrite `label_segments`
ttnghia May 26, 2022
d3708a5
Fix compile error
ttnghia May 26, 2022
60259c9
Hack to test
ttnghia May 26, 2022
97b9a57
Revert "Add a benchmark"
ttnghia May 26, 2022
a094dab
Revert "Hack to test"
ttnghia May 26, 2022
8315764
Merge branch 'branch-22.08' into list_label
ttnghia May 26, 2022
f5a5520
Add comment
ttnghia May 26, 2022
3be6b16
Merge branch 'list_label' into offsets_from_labels
ttnghia May 26, 2022
a060e3d
Add comment clarifying bound check
ttnghia May 26, 2022
6ac3f49
Implement `labels_to_offsets`
ttnghia May 26, 2022
b8cb363
Rewrite example
ttnghia May 26, 2022
2218efa
Merge branch 'list_label' into offsets_from_labels
ttnghia May 26, 2022
9bdfa6c
Fix initialization
ttnghia May 26, 2022
baa303f
Use `label_to_offsets`
ttnghia May 26, 2022
039b92f
Change variable name
ttnghia May 26, 2022
ab1e25a
Reverse comments. They will be removed completely later on so don't c…
ttnghia May 26, 2022
f9e0819
Remove unused headers
ttnghia May 26, 2022
52f6c30
Rewrite example
ttnghia May 26, 2022
8e1f01a
Merge branch 'branch-22.08' into list_label
ttnghia May 27, 2022
cc0dfc1
Merge branch 'branch-22.08' into list_label
ttnghia May 27, 2022
79e2f6f
Rewrite comments
ttnghia May 27, 2022
6eec56a
Cleanup headers
ttnghia May 27, 2022
ba91075
Cleanup headers
ttnghia May 27, 2022
de2f197
Cleanup headers
ttnghia May 27, 2022
d296e32
Merge branch 'list_label' into offsets_from_labels
ttnghia May 27, 2022
becb593
Use offsets iterator directly
ttnghia May 28, 2022
e461814
Initialize output at first
ttnghia May 28, 2022
b0d5122
Merge branch 'branch-22.08' into list_label
ttnghia May 28, 2022
b7e6d9a
Fix loop, excluding the last offset value
ttnghia May 28, 2022
a3732ac
Merge branch 'list_label' into offsets_from_labels
ttnghia May 28, 2022
bfe0bf0
Add comment
ttnghia May 28, 2022
42ce30b
Rewrite comment
ttnghia May 28, 2022
7f83afb
Merge branch 'list_label' into offsets_from_labels
ttnghia May 28, 2022
74a33d4
Merge branch 'branch-22.08' into list_label
ttnghia May 28, 2022
15d036a
Try to reverse `sort_helper.cu`
ttnghia May 29, 2022
26aed34
Revert "Try to reverse `sort_helper.cu`"
ttnghia May 29, 2022
a9930b1
Handle the special case when the output array is empty
ttnghia May 29, 2022
10812bb
Reorganize code
ttnghia May 29, 2022
ebd9e2c
Merge branch 'list_label' into offsets_from_labels
ttnghia May 29, 2022
847311b
Add a test
ttnghia May 29, 2022
ae669df
Merge branch 'list_label' into offsets_from_labels
ttnghia May 29, 2022
fcc48e9
Simplify code
ttnghia May 29, 2022
1e7b843
Modify test
ttnghia May 30, 2022
ba58d6f
Merge branch 'branch-22.08' into list_label
ttnghia May 30, 2022
6683469
Merge branch 'list_label' into offsets_from_labels
ttnghia May 30, 2022
6e098a2
Rewrite comment
ttnghia May 30, 2022
8dd7f2d
Change termination condition
ttnghia May 30, 2022
64a107c
Add comment
ttnghia May 31, 2022
77002ef
Fix comment
ttnghia May 31, 2022
136511d
Rename `out_` iterators into `label_`
ttnghia May 31, 2022
fea7d6d
Merge branch 'list_label' into offsets_from_labels
ttnghia Jun 1, 2022
6578a3e
Rewrite comments
ttnghia Jun 1, 2022
0e24124
Merge branch 'branch-22.08' into offsets_from_labels
ttnghia Jun 1, 2022
88d5554
Update comments
ttnghia Jun 1, 2022
c13b4b9
Change all `size_type` into `OutputType`
ttnghia Jun 3, 2022
7a23dcd
Remove casting for `num_segments`
ttnghia Jun 3, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 91 additions & 1 deletion cpp/include/cudf/detail/labeling/label_segments.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,13 @@
#include <cudf/types.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/device_uvector.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/distance.h>
#include <thrust/for_each.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/uninitialized_fill.h>

Expand Down Expand Up @@ -75,7 +78,7 @@ void label_segments(InputIterator offsets_begin,

// If the offsets array has no more than 2 offset values, there will be at max 1 segment.
// In such cases, the output will just be an array of all `0` values (which we already filled).
// We should terminate here, otherwise the `inclusive_scan` call below still do its entire
// We should terminate from here, otherwise the `inclusive_scan` call below still does its entire
// computation. That is unnecessary and may be expensive if we have the input offsets defining a
// very large segment.
if (thrust::distance(offsets_begin, offsets_end) <= 2) { return; }
Expand All @@ -96,4 +99,91 @@ void label_segments(InputIterator offsets_begin,
thrust::inclusive_scan(rmm::exec_policy(stream), label_begin, label_end, label_begin);
}

/**
 * @brief Convert runs of identical label values into segment offsets.
 *
 * The input range is read as consecutive runs of equal label values. The length of each run
 * becomes the size of the segment identified by that label value, and the sizes are then
 * accumulated into offsets.
 *
 * Label values that never occur in the input are treated as empty segments: their size stays
 * zero, so the offsets on both sides of such a segment are equal.
 *
 * The caller is responsible for providing an output range of the correct size, which is the
 * maximum label value plus two (i.e., `size = *(labels_end - 1) + 2`). Otherwise, the result is
 * undefined.
 *
 * @code{.pseudo}
 * Examples:
 *
 * labels = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ]
 * output = [ 0, 4, 6, 6, 6, 10 ]
 *
 * labels = [ 0, 0, 0, 0, 0, 1, 1 ]
 * output = [ 0, 5, 7 ]
 * @endcode
 *
 * @param labels_begin The beginning of the labels that define segments.
 * @param labels_end The end of the labels that define segments.
 * @param offsets_begin The beginning of the output offset range.
 * @param offsets_end The end of the output offset range.
 * @param stream CUDA stream used for device memory operations and kernel launches.
 */
template <typename InputIterator, typename OutputIterator>
void labels_to_offsets(InputIterator labels_begin,
                       InputIterator labels_end,
                       OutputIterator offsets_begin,
                       OutputIterator offsets_end,
                       rmm::cuda_stream_view stream)
{
  using OutputType = typename thrust::iterator_value<OutputIterator>::type;

  // Zero every output slot up front, whatever the input looks like.
  thrust::uninitialized_fill(rmm::exec_policy(stream), offsets_begin, offsets_end, OutputType{0});

  // Without any label there is nothing to count, so stop here:
  //  - Either there is no segment at all (the output range is empty), in which case the
  //    `num_segments` value computed below would be meaningless and must not be used,
  //  - Or every segment is empty, in which case the zero fill above is already the final answer
  //    and running the scan below over a potentially huge output would be wasted work.
  auto const num_labels = thrust::distance(labels_begin, labels_end);
  if (num_labels == 0) { return; }

  // The output holds one offset per segment plus the trailing end offset.
  auto const num_segments = thrust::distance(offsets_begin, offsets_end) - 1;

  //================================================================================
  // Running example: labels = [ 0, 0, 0, 0, 1, 1, 4, 4, 4, 4 ].

  // One entry per distinct run of labels. For the example: [0, 1, 4].
  auto unique_labels = rmm::device_uvector<OutputType>(num_segments, stream);

  // Size of each non-empty segment, aligned with `unique_labels`. For the example: [4, 2, 4].
  auto segment_sizes = rmm::device_uvector<OutputType>(num_segments, stream);

  // Sum a constant `1` over every run of equal labels, i.e. count the labels in each run.
  // The returned pair marks the ends of the written keys and of the written counts.
  auto const reduced_ends = thrust::reduce_by_key(rmm::exec_policy(stream),
                                                  labels_begin,
                                                  labels_end,
                                                  thrust::make_constant_iterator<OutputType>(1),
                                                  unique_labels.begin(),
                                                  segment_sizes.begin());

  // Write each segment size into the output slot indexed by its label value.
  // For the example, scattering [4, 2, 4] with the map [0, 1, 4] gives
  // output = [4, 2, 0, 0, 4, 0].
  thrust::scatter(rmm::exec_policy(stream),
                  segment_sizes.begin(),
                  reduced_ends.second,
                  unique_labels.begin(),
                  offsets_begin);

  // Turn the sizes into offsets in place.
  // For the example, the final output is [0, 4, 6, 6, 6, 10].
  thrust::exclusive_scan(rmm::exec_policy(stream), offsets_begin, offsets_end, offsets_begin);
}

} // namespace cudf::detail
89 changes: 13 additions & 76 deletions cpp/src/lists/drop_list_duplicates.cu
Original file line number Diff line number Diff line change
Expand Up @@ -40,12 +40,7 @@
#include <thrust/distance.h>
#include <thrust/equal.h>
#include <thrust/execution_policy.h>
#include <thrust/iterator/constant_iterator.h>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/reduce.h>
#include <thrust/scan.h>
#include <thrust/scatter.h>
#include <thrust/uninitialized_fill.h>

#include <optional>

Expand Down Expand Up @@ -436,72 +431,6 @@ std::vector<std::unique_ptr<column>> get_unique_entries_and_list_indices(
->release();
}

/**
* @brief Generate list offsets from entry list indices for the final result lists column(s).
*
* The number of entries carrying each list index is counted, written into a zero-filled array of
* `num_lists + 1` elements at the position given by that list index, and then exclusive-scanned
* into offsets. Lists whose index does not appear in `entries_list_indices` end up empty.
*
* Note that the list indices are consumed as 0-based values: with the scatter followed by an
* exclusive scan performed below, only 0-based indices produce valid offsets.
*
* @param num_lists The number of lists.
* @param entries_list_indices The mapping from list entries to their (0-based) list indices.
* Entries of the same list are expected to be adjacent (presumably guaranteed by the caller),
* since only consecutive equal indices are counted together.
* @param stream CUDA stream used for device memory operations and kernel launches.
* @param mr Device resource used to allocate the returned offsets.
* @return A column of `num_lists + 1` offsets.
*/
std::unique_ptr<column> generate_output_offsets(size_type num_lists,
column_view const& entries_list_indices,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
{
// Let's consider an example:
// Given the original offsets of the input lists column is [0, 4, 5, 6, 7, 10, 11, 13].
// The original entries_list_indices is [0, 0, 0, 0, 1, 2, 3, 4, 4, 4, 5, 6, 6], and after
// extracting unique entries the entries_list_indices becomes [0, 0, 0, 3, 4, 4, 4, 6, 6]
// and num_lists is 7. These are the input to this function.
//
// Through extracting unique list entries, one entry in the list with index 0 has been removed,
// and the entries in the lists with indices {1, 2, 5} have been removed completely.

// This variable stores the (0-based) list indices of the unique entries, but only one index value
// per non-empty list. Given the example above, this array will hold the values [0, 3, 4, 6].
// It is sized for the worst case in which every list is non-empty.
auto list_indices = rmm::device_uvector<size_type>(num_lists, stream);

// Stores the non-zero numbers of unique entries per list.
// Given the example above, this array will contain the values [3, 1, 3, 2].
auto list_sizes = rmm::device_uvector<size_type>(num_lists, stream);

// Count the numbers of unique entries for each non-empty list by summing a constant `1` over
// every run of equal list indices.
auto const end = thrust::reduce_by_key(rmm::exec_policy(stream),
entries_list_indices.template begin<size_type>(),
entries_list_indices.template end<size_type>(),
thrust::make_constant_iterator<size_type>(1),
list_indices.begin(),
list_sizes.begin());
// Only the first `num_non_empty_lists` elements of `list_indices`/`list_sizes` were written.
auto const num_non_empty_lists = thrust::distance(list_indices.begin(), end.first);

// The output offsets for the output lists column(s).
auto new_offsets = rmm::device_uvector<offset_type>(num_lists + 1, stream, mr);

// The new offsets need to be filled with 0 value first, so that the lists receiving no size
// below are left with a size of zero.
thrust::uninitialized_fill_n(
rmm::exec_policy(stream), new_offsets.begin(), num_lists + 1, offset_type{0});

// Scatter non-zero sizes of the output lists into the positions given by their list indices.
// Given the example above, we will have new_offsets = [3, 0, 0, 1, 3, 0, 2, 0].
thrust::scatter(rmm::exec_policy(stream),
list_sizes.begin(),
list_sizes.begin() + num_non_empty_lists,
list_indices.begin(),
new_offsets.begin());

// Generate offsets from sizes (in place).
// Given the example above, we will have new_offsets = [0, 3, 3, 3, 4, 7, 7, 9].
thrust::exclusive_scan(
rmm::exec_policy(stream), new_offsets.begin(), new_offsets.end(), new_offsets.begin());

// Done. Hand the offsets buffer over to a new column of `num_lists + 1` elements.
return std::make_unique<column>(
data_type{type_to_id<offset_type>()}, num_lists + 1, new_offsets.release());
}

/**
* @brief Common execution code called by all public `drop_list_duplicates` APIs.
*/
Expand Down Expand Up @@ -594,11 +523,19 @@ std::pair<std::unique_ptr<column>, std::unique_ptr<column>> drop_list_duplicates
mr);

// Generate offsets for the output lists column(s).
auto output_offsets = generate_output_offsets(
keys.size(),
unique_entries_and_list_indices.back()->view(), // unique entries' list indices
stream,
mr);
auto output_offsets = [&] {
auto out_offsets = make_numeric_column(
data_type{type_to_id<offset_type>()}, keys.size() + 1, mask_state::UNALLOCATED, stream, mr);
auto const offsets = out_offsets->mutable_view();
auto const labels =
unique_entries_and_list_indices.back()->view(); // unique entries' list indices
cudf::detail::labels_to_offsets(labels.template begin<size_type>(),
labels.template end<size_type>(),
offsets.template begin<size_type>(),
offsets.template end<size_type>(),
stream);
return out_offsets;
}();

// If the values lists column is not given, its corresponding output will be nullptr.
auto out_values =
Expand Down