diff --git a/cpp/benchmarks/string/copy_benchmark.cpp b/cpp/benchmarks/string/copy_benchmark.cpp index b49bc878ca7..23a70215015 100644 --- a/cpp/benchmarks/string/copy_benchmark.cpp +++ b/cpp/benchmarks/string/copy_benchmark.cpp @@ -74,6 +74,9 @@ static void generate_bench_args(benchmark::internal::Benchmark* b) int const max_rowlen = 1 << 13; int const len_mult = 4; generate_string_bench_args(b, min_rows, max_rows, row_mult, min_rowlen, max_rowlen, len_mult); + + // Benchmark for very small strings + b->Args({67108864, 2}); } #define COPY_BENCHMARK_DEFINE(name) \ diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 86f79881408..dcd17245ee6 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -34,6 +35,169 @@ namespace cudf { namespace strings { namespace detail { +// Helper function for loading 16B from a potentially unaligned memory location to registers. +__forceinline__ __device__ uint4 load_uint4(const char* ptr) +{ + auto const offset = reinterpret_cast(ptr) % 4; + auto const* aligned_ptr = reinterpret_cast(ptr - offset); + auto const shift = offset * 8; + + uint4 regs = {aligned_ptr[0], aligned_ptr[1], aligned_ptr[2], aligned_ptr[3]}; + uint tail = 0; + if (shift) tail = aligned_ptr[4]; + + regs.x = __funnelshift_r(regs.x, regs.y, shift); + regs.y = __funnelshift_r(regs.y, regs.z, shift); + regs.z = __funnelshift_r(regs.z, regs.w, shift); + regs.w = __funnelshift_r(regs.w, tail, shift); + + return regs; +} + +/** + * @brief Gather characters from the input iterator, with string parallel strategy. + * + * This strategy assigns strings to warps so that each warp can cooperatively copy from the input + * location of the string to the corresponding output location. Large datatype (uint4) is used for + * stores. This strategy is best suited for large strings. + * + * @tparam StringIterator Iterator should produce `string_view` objects. + * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`. + * + * @param strings_begin Start of the iterator to retrieve `string_view` instances. + * @param out_chars Output buffer for gathered characters. + * @param out_offsets The offset values associated with the output buffer. + * @param string_indices Start of index iterator. + * @param total_out_strings Number of output strings to be gathered. + */ +template +__global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, + char* out_chars, + cudf::device_span const out_offsets, + MapIterator string_indices, + size_type total_out_strings) +{ + constexpr size_t out_datatype_size = sizeof(uint4); + constexpr size_t in_datatype_size = sizeof(uint); + + int global_thread_id = blockIdx.x * blockDim.x + threadIdx.x; + int global_warp_id = global_thread_id / cudf::detail::warp_size; + int warp_lane = global_thread_id % cudf::detail::warp_size; + int nwarps = gridDim.x * blockDim.x / cudf::detail::warp_size; + + auto const alignment_offset = reinterpret_cast(out_chars) % out_datatype_size; + uint4* out_chars_aligned = reinterpret_cast(out_chars - alignment_offset); + + for (size_type istring = global_warp_id; istring < total_out_strings; istring += nwarps) { + auto const out_start = out_offsets[istring]; + auto const out_end = out_offsets[istring + 1]; + + // This check is necessary because string_indices[istring] may be out of bound. + if (out_start == out_end) continue; + + const char* in_start = strings_begin[string_indices[istring]].data(); + + // Both `out_start_aligned` and `out_end_aligned` are indices into `out_chars`. + // `out_start_aligned` is the first 16B aligned memory location after `out_start + 4`. + // `out_end_aligned` is the last 16B aligned memory location before `out_end - 4`. Characters + // between `[out_start_aligned, out_end_aligned)` will be copied using uint4. + // `out_start + 4` and `out_end - 4` are used instead of `out_start` and `out_end` to avoid + // `load_uint4` reading beyond string boundaries. + int32_t out_start_aligned = + (out_start + in_datatype_size + alignment_offset + out_datatype_size - 1) / + out_datatype_size * out_datatype_size - + alignment_offset; + int32_t out_end_aligned = + (out_end - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size - + alignment_offset; + + for (size_type ichar = out_start_aligned + warp_lane * out_datatype_size; + ichar < out_end_aligned; + ichar += cudf::detail::warp_size * out_datatype_size) { + *(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) = + load_uint4(in_start + ichar - out_start); + } + + // Tail logic: copy characters of the current string outside `[out_start_aligned, + // out_end_aligned)`. + if (out_end_aligned <= out_start_aligned) { + // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the + // entire string. + for (int32_t ichar = out_start + warp_lane; ichar < out_end; + ichar += cudf::detail::warp_size) { + out_chars[ichar] = in_start[ichar - out_start]; + } + } else { + // Copy characters in range `[out_start, out_start_aligned)`. + if (out_start + warp_lane < out_start_aligned) { + out_chars[out_start + warp_lane] = in_start[warp_lane]; + } + // Copy characters in range `[out_end_aligned, out_end)`. + int32_t ichar = out_end_aligned + warp_lane; + if (ichar < out_end) { out_chars[ichar] = in_start[ichar - out_start]; } + } + } +} + +/** + * @brief Gather characters from the input iterator, with char parallel strategy. + * + * This strategy assigns characters to threads, and uses binary search for getting the string + * index. To improve the binary search performance, fixed number of strings per threadblock is + * used. This strategy is best suited for small strings. + * + * @tparam StringIterator Iterator should produce `string_view` objects. + * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`. + * + * @param strings_begin Start of the iterator to retrieve `string_view` instances. + * @param out_chars Output buffer for gathered characters. + * @param out_offsets The offset values associated with the output buffer. + * @param string_indices Start of index iterator. + * @param total_out_strings Number of output strings to be gathered. + */ +template +__global__ void gather_chars_fn_char_parallel(StringIterator strings_begin, + char* out_chars, + cudf::device_span const out_offsets, + MapIterator string_indices, + size_type total_out_strings) +{ + __shared__ int32_t out_offsets_threadblock[strings_per_threadblock + 1]; + + // Current thread block will process output strings starting at `begin_out_string_idx`. + size_type begin_out_string_idx = blockIdx.x * strings_per_threadblock; + + // Number of strings to be processed by the current threadblock. + size_type strings_current_threadblock = + min(strings_per_threadblock, total_out_strings - begin_out_string_idx); + + if (strings_current_threadblock <= 0) return; + + // Collectively load offsets of strings processed by the current thread block. + for (size_type idx = threadIdx.x; idx <= strings_current_threadblock; idx += blockDim.x) { + out_offsets_threadblock[idx] = out_offsets[idx + begin_out_string_idx]; + } + __syncthreads(); + + for (int32_t out_ibyte = threadIdx.x + out_offsets_threadblock[0]; + out_ibyte < out_offsets_threadblock[strings_current_threadblock]; + out_ibyte += blockDim.x) { + // binary search for the string index corresponding to out_ibyte + auto const string_idx_iter = + thrust::prev(thrust::upper_bound(thrust::seq, + out_offsets_threadblock, + out_offsets_threadblock + strings_current_threadblock, + out_ibyte)); + size_type string_idx = thrust::distance(out_offsets_threadblock, string_idx_iter); + + // calculate which character to load within the string + int32_t icharacter = out_ibyte - out_offsets_threadblock[string_idx]; + + size_type in_string_idx = string_indices[begin_out_string_idx + string_idx]; + out_chars[out_ibyte] = strings_begin[in_string_idx].data()[icharacter]; + } +} + /** * @brief Returns a new chars column using the specified indices to select * strings from the input iterator. @@ -44,7 +208,7 @@ namespace detail { * @tparam StringIterator Iterator should produce `string_view` objects. * @tparam MapIterator Iterator for retrieving integer indices of the `StringIterator`. * - * @param strings_begin Start of the iterator to retrieve `string_view` instances + * @param strings_begin Start of the iterator to retrieve `string_view` instances. * @param map_begin Start of index iterator. * @param map_end End of index iterator. * @param offsets The offset values to be associated with the output chars column. @@ -68,20 +232,29 @@ std::unique_ptr gather_chars(StringIterator strings_begin, auto chars_column = create_chars_child_column(output_count, chars_bytes, stream, mr); auto const d_chars = chars_column->mutable_view().template data(); - auto gather_chars_fn = [strings_begin, map_begin, offsets] __device__(size_type out_idx) -> char { - auto const out_row = - thrust::prev(thrust::upper_bound(thrust::seq, offsets.begin(), offsets.end(), out_idx)); - auto const row_idx = map_begin[thrust::distance(offsets.begin(), out_row)]; // get row index - auto const d_str = strings_begin[row_idx]; // get row's string - auto const offset = out_idx - *out_row; // get string's char - return d_str.data()[offset]; - }; + constexpr int warps_per_threadblock = 4; + // String parallel strategy will be used if average string length is above this threshold. + // Otherwise, char parallel strategy will be used. + constexpr size_type string_parallel_threshold = 32; - thrust::transform(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(chars_bytes), - d_chars, - gather_chars_fn); + size_type average_string_length = chars_bytes / output_count; + + if (average_string_length > string_parallel_threshold) { + constexpr int max_threadblocks = 65536; + gather_chars_fn_string_parallel<<< + min((static_cast(output_count) + warps_per_threadblock - 1) / warps_per_threadblock, + max_threadblocks), + warps_per_threadblock * cudf::detail::warp_size, + 0, + stream.value()>>>(strings_begin, d_chars, offsets, map_begin, output_count); + } else { + constexpr int strings_per_threadblock = 32; + gather_chars_fn_char_parallel + <<<(output_count + strings_per_threadblock - 1) / strings_per_threadblock, + warps_per_threadblock * cudf::detail::warp_size, + 0, + stream.value()>>>(strings_begin, d_chars, offsets, map_begin, output_count); + } return chars_column; }