Skip to content

Commit

Permalink
Use offsetalator in gather_chars (#14700)
Browse files Browse the repository at this point in the history
Update `cudf::strings::detail::gather_chars()` utility to use the offsetalator iterators. 
The offsetalator enables the utility to use int32 or int64 offsets in the future with no additional changes.
This utility is mainly used by a `make_strings_column` factory and is already optimized for columns with both long and short strings.
The `input_offsetalator` is also updated to include an additional optional offset parameter for cases where the parent strings column has been sliced and requires building the iterator starting from a non-zero index within the offsets child column.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Nghia Truong (https://github.com/ttnghia)
  - Yunsong Wang (https://github.com/PointKernel)

URL: #14700
  • Loading branch information
davidwendt authored Jan 8, 2024
1 parent 6083efa commit fc142eb
Show file tree
Hide file tree
Showing 4 changed files with 37 additions and 30 deletions.
10 changes: 6 additions & 4 deletions cpp/include/cudf/detail/offsets_iterator.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -63,10 +63,11 @@ struct input_offsetalator : base_normalator<input_offsetalator, int64_t> {
*
* Use the indexalator_factory to create an iterator instance.
*
* @param data Pointer to an integer array in device memory.
* @param dtype Type of data in data
* @param data Pointer to an integer array in device memory
* @param dtype Type of data in data
* @param offset Index value within `offsets` to use as the beginning of the iterator
*/
CUDF_HOST_DEVICE input_offsetalator(void const* data, data_type dtype)
CUDF_HOST_DEVICE input_offsetalator(void const* data, data_type dtype, size_type offset = 0)
: base_normalator<input_offsetalator, int64_t>(
dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)),
p_{static_cast<char const*>(data)}
Expand All @@ -78,6 +79,7 @@ struct input_offsetalator : base_normalator<input_offsetalator, int64_t> {
cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) &&
"Unexpected offsets type");
#endif
p_ += (this->width_ * offset);
}

protected:
Expand Down
11 changes: 8 additions & 3 deletions cpp/include/cudf/detail/offsets_iterator_factory.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2023, NVIDIA CORPORATION.
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -28,14 +28,19 @@ namespace detail {
struct offsetalator_factory {
/**
* @brief Create an input offsetalator instance from an offsets column
*
* @param offsets Column to wrap with an offsetalator
* @param offset Index value within `offsets` to use as the beginning of the iterator
*/
static input_offsetalator make_input_iterator(column_view const& offsets)
static input_offsetalator make_input_iterator(column_view const& offsets, size_type offset = 0)
{
return input_offsetalator(offsets.head(), offsets.type());
return input_offsetalator(offsets.head(), offsets.type(), offset);
}

/**
* @brief Create an output offsetalator instance from an offsets column
*
* @param offsets Column to wrap with an offsetalator
*/
static output_offsetalator make_output_iterator(mutable_column_view const& offsets)
{
Expand Down
41 changes: 21 additions & 20 deletions cpp/include/cudf/strings/detail/gather.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2019-2023, NVIDIA CORPORATION.
* Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -18,6 +18,7 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
#include <cudf/detail/utilities/cuda.cuh>
#include <cudf/strings/detail/utilities.hpp>
Expand Down Expand Up @@ -79,7 +80,7 @@ __forceinline__ __device__ uint4 load_uint4(char const* ptr)
template <typename StringIterator, typename MapIterator>
__global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
char* out_chars,
cudf::device_span<int32_t const> const out_offsets,
cudf::detail::input_offsetalator const out_offsets,
MapIterator string_indices,
size_type total_out_strings)
{
Expand Down Expand Up @@ -109,28 +110,25 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
// between `[out_start_aligned, out_end_aligned)` will be copied using uint4.
// `out_start + 4` and `out_end - 4` are used instead of `out_start` and `out_end` to avoid
// `load_uint4` reading beyond string boundaries.
int32_t out_start_aligned =
int64_t const out_start_aligned =
(out_start + in_datatype_size + alignment_offset + out_datatype_size - 1) /
out_datatype_size * out_datatype_size -
alignment_offset;
int32_t out_end_aligned =
int64_t const out_end_aligned =
(out_end - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size -
alignment_offset;

for (size_type ichar = out_start_aligned + warp_lane * out_datatype_size;
ichar < out_end_aligned;
for (int64_t ichar = out_start_aligned + warp_lane * out_datatype_size; ichar < out_end_aligned;
ichar += cudf::detail::warp_size * out_datatype_size) {
*(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) =
load_uint4(in_start + ichar - out_start);
}

// Tail logic: copy characters of the current string outside `[out_start_aligned,
// out_end_aligned)`.
// Copy characters of the current string outside [out_start_aligned, out_end_aligned)
if (out_end_aligned <= out_start_aligned) {
// In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the
// entire string.
for (int32_t ichar = out_start + warp_lane; ichar < out_end;
ichar += cudf::detail::warp_size) {
for (auto ichar = out_start + warp_lane; ichar < out_end; ichar += cudf::detail::warp_size) {
out_chars[ichar] = in_start[ichar - out_start];
}
} else {
Expand All @@ -139,7 +137,7 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
out_chars[out_start + warp_lane] = in_start[warp_lane];
}
// Copy characters in range `[out_end_aligned, out_end)`.
int32_t ichar = out_end_aligned + warp_lane;
auto const ichar = out_end_aligned + warp_lane;
if (ichar < out_end) { out_chars[ichar] = in_start[ichar - out_start]; }
}
}
Expand All @@ -164,11 +162,11 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin,
template <int strings_per_threadblock, typename StringIterator, typename MapIterator>
__global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
char* out_chars,
cudf::device_span<int32_t const> const out_offsets,
cudf::detail::input_offsetalator const out_offsets,
MapIterator string_indices,
size_type total_out_strings)
{
__shared__ int32_t out_offsets_threadblock[strings_per_threadblock + 1];
__shared__ int64_t out_offsets_threadblock[strings_per_threadblock + 1];

// Current thread block will process output strings starting at `begin_out_string_idx`.
size_type begin_out_string_idx = blockIdx.x * strings_per_threadblock;
Expand All @@ -185,7 +183,7 @@ __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
}
__syncthreads();

for (int32_t out_ibyte = threadIdx.x + out_offsets_threadblock[0];
for (int64_t out_ibyte = threadIdx.x + out_offsets_threadblock[0];
out_ibyte < out_offsets_threadblock[strings_current_threadblock];
out_ibyte += blockDim.x) {
// binary search for the string index corresponding to out_ibyte
Expand All @@ -197,7 +195,7 @@ __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin,
size_type string_idx = thrust::distance(out_offsets_threadblock, string_idx_iter);

// calculate which character to load within the string
int32_t icharacter = out_ibyte - out_offsets_threadblock[string_idx];
auto const icharacter = out_ibyte - out_offsets_threadblock[string_idx];

size_type in_string_idx = string_indices[begin_out_string_idx + string_idx];
out_chars[out_ibyte] = strings_begin[in_string_idx].data()[icharacter];
Expand Down Expand Up @@ -227,7 +225,7 @@ template <typename StringIterator, typename MapIterator>
std::unique_ptr<cudf::column> gather_chars(StringIterator strings_begin,
MapIterator map_begin,
MapIterator map_end,
cudf::device_span<int32_t const> const offsets,
cudf::detail::input_offsetalator const offsets,
size_type chars_bytes,
rmm::cuda_stream_view stream,
rmm::mr::device_memory_resource* mr)
Expand Down Expand Up @@ -300,22 +298,25 @@ std::unique_ptr<cudf::column> gather(strings_column_view const& strings,
// build offsets column
auto const d_strings = column_device_view::create(strings.parent(), stream);
auto const d_in_offsets = !strings.is_empty() ? strings.offsets_begin() : nullptr;
auto const d_in_offsets = cudf::detail::offsetalator_factory::make_input_iterator(
strings.is_empty() ? make_empty_column(type_id::INT32)->view() : strings.offsets(),
strings.offset());
auto offsets_itr = thrust::make_transform_iterator(
begin,
cuda::proclaim_return_type<size_type>(
[d_strings = *d_strings, d_in_offsets] __device__(size_type idx) {
if (NullifyOutOfBounds && (idx < 0 || idx >= d_strings.size())) { return 0; }
if (not d_strings.is_valid(idx)) { return 0; }
return d_in_offsets[idx + 1] - d_in_offsets[idx];
return static_cast<size_type>(d_in_offsets[idx + 1] - d_in_offsets[idx]);
}));
auto [out_offsets_column, total_bytes] =
cudf::detail::make_offsets_child_column(offsets_itr, offsets_itr + output_count, stream, mr);
// build chars column
auto const offsets_view = out_offsets_column->view();
auto out_chars_column = gather_chars(
auto const offsets_view =
cudf::detail::offsetalator_factory::make_input_iterator(out_offsets_column->view());
auto out_chars_column = gather_chars(
d_strings->begin<string_view>(), begin, end, offsets_view, total_bytes, stream, mr);
return make_strings_column(output_count,
Expand Down
5 changes: 2 additions & 3 deletions cpp/include/cudf/strings/detail/strings_column_factories.cuh
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -103,9 +103,8 @@ std::unique_ptr<column> make_strings_column(IndexPairIterator begin,
auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1);
// use a character-parallel kernel for long string lengths
if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) {
auto const d_data = offsets_view.template data<size_type>();
auto const d_offsets =
device_span<size_type const>{d_data, static_cast<std::size_t>(offsets_view.size())};
cudf::detail::offsetalator_factory::make_input_iterator(offsets_view);
auto const str_begin = thrust::make_transform_iterator(
begin, cuda::proclaim_return_type<string_view>([] __device__(auto ip) {
return string_view{ip.first, ip.second};
Expand Down

0 comments on commit fc142eb

Please sign in to comment.