diff --git a/cpp/include/cudf/detail/offsets_iterator.cuh b/cpp/include/cudf/detail/offsets_iterator.cuh index 3eb77b32353..15b334245ff 100644 --- a/cpp/include/cudf/detail/offsets_iterator.cuh +++ b/cpp/include/cudf/detail/offsets_iterator.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -63,10 +63,11 @@ struct input_offsetalator : base_normalator { * * Use the indexalator_factory to create an iterator instance. * - * @param data Pointer to an integer array in device memory. - * @param dtype Type of data in data + * @param data Pointer to an integer array in device memory + * @param dtype Type of data in data + * @param offset Index value within `offsets` to use as the beginning of the iterator */ - CUDF_HOST_DEVICE input_offsetalator(void const* data, data_type dtype) + CUDF_HOST_DEVICE input_offsetalator(void const* data, data_type dtype, size_type offset = 0) : base_normalator( dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)), p_{static_cast(data)} @@ -78,6 +79,7 @@ struct input_offsetalator : base_normalator { cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) && "Unexpected offsets type"); #endif + p_ += (this->width_ * offset); } protected: diff --git a/cpp/include/cudf/detail/offsets_iterator_factory.cuh b/cpp/include/cudf/detail/offsets_iterator_factory.cuh index 5b4c6b825d2..e234f9ec627 100644 --- a/cpp/include/cudf/detail/offsets_iterator_factory.cuh +++ b/cpp/include/cudf/detail/offsets_iterator_factory.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2023, NVIDIA CORPORATION. + * Copyright (c) 2023-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -28,14 +28,19 @@ namespace detail { struct offsetalator_factory { /** * @brief Create an input offsetalator instance from an offsets column + * + * @param offsets Column to wrap with an offsetalator + * @param offset Index value within `offsets` to use as the beginning of the iterator */ - static input_offsetalator make_input_iterator(column_view const& offsets) + static input_offsetalator make_input_iterator(column_view const& offsets, size_type offset = 0) { - return input_offsetalator(offsets.head(), offsets.type()); + return input_offsetalator(offsets.head(), offsets.type(), offset); } /** * @brief Create an output offsetalator instance from an offsets column + * + * @param offsets Column to wrap with an offsetalator */ static output_offsetalator make_output_iterator(mutable_column_view const& offsets) { diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index 1523a81d63f..e681373e6e0 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2023, NVIDIA CORPORATION. + * Copyright (c) 2019-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -79,7 +80,7 @@ __forceinline__ __device__ uint4 load_uint4(char const* ptr) template __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, char* out_chars, - cudf::device_span const out_offsets, + cudf::detail::input_offsetalator const out_offsets, MapIterator string_indices, size_type total_out_strings) { @@ -109,28 +110,25 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, // between `[out_start_aligned, out_end_aligned)` will be copied using uint4. // `out_start + 4` and `out_end - 4` are used instead of `out_start` and `out_end` to avoid // `load_uint4` reading beyond string boundaries. - int32_t out_start_aligned = + int64_t const out_start_aligned = (out_start + in_datatype_size + alignment_offset + out_datatype_size - 1) / out_datatype_size * out_datatype_size - alignment_offset; - int32_t out_end_aligned = + int64_t const out_end_aligned = (out_end - in_datatype_size + alignment_offset) / out_datatype_size * out_datatype_size - alignment_offset; - for (size_type ichar = out_start_aligned + warp_lane * out_datatype_size; - ichar < out_end_aligned; + for (int64_t ichar = out_start_aligned + warp_lane * out_datatype_size; ichar < out_end_aligned; ichar += cudf::detail::warp_size * out_datatype_size) { *(out_chars_aligned + (ichar + alignment_offset) / out_datatype_size) = load_uint4(in_start + ichar - out_start); } - // Tail logic: copy characters of the current string outside `[out_start_aligned, - // out_end_aligned)`. + // Copy characters of the current string outside [out_start_aligned, out_end_aligned) if (out_end_aligned <= out_start_aligned) { // In this case, `[out_start_aligned, out_end_aligned)` is an empty set, and we copy the // entire string. - for (int32_t ichar = out_start + warp_lane; ichar < out_end; - ichar += cudf::detail::warp_size) { + for (auto ichar = out_start + warp_lane; ichar < out_end; ichar += cudf::detail::warp_size) { out_chars[ichar] = in_start[ichar - out_start]; } } else { @@ -139,7 +137,7 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, out_chars[out_start + warp_lane] = in_start[warp_lane]; } // Copy characters in range `[out_end_aligned, out_end)`. - int32_t ichar = out_end_aligned + warp_lane; + auto const ichar = out_end_aligned + warp_lane; if (ichar < out_end) { out_chars[ichar] = in_start[ichar - out_start]; } } } @@ -164,11 +162,11 @@ __global__ void gather_chars_fn_string_parallel(StringIterator strings_begin, template __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin, char* out_chars, - cudf::device_span const out_offsets, + cudf::detail::input_offsetalator const out_offsets, MapIterator string_indices, size_type total_out_strings) { - __shared__ int32_t out_offsets_threadblock[strings_per_threadblock + 1]; + __shared__ int64_t out_offsets_threadblock[strings_per_threadblock + 1]; // Current thread block will process output strings starting at `begin_out_string_idx`. size_type begin_out_string_idx = blockIdx.x * strings_per_threadblock; @@ -185,7 +183,7 @@ __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin, } __syncthreads(); - for (int32_t out_ibyte = threadIdx.x + out_offsets_threadblock[0]; + for (int64_t out_ibyte = threadIdx.x + out_offsets_threadblock[0]; out_ibyte < out_offsets_threadblock[strings_current_threadblock]; out_ibyte += blockDim.x) { // binary search for the string index corresponding to out_ibyte @@ -197,7 +195,7 @@ __global__ void gather_chars_fn_char_parallel(StringIterator strings_begin, size_type string_idx = thrust::distance(out_offsets_threadblock, string_idx_iter); // calculate which character to load within the string - int32_t icharacter = out_ibyte - out_offsets_threadblock[string_idx]; + auto const icharacter = out_ibyte - out_offsets_threadblock[string_idx]; size_type in_string_idx = string_indices[begin_out_string_idx + string_idx]; out_chars[out_ibyte] = strings_begin[in_string_idx].data()[icharacter]; @@ -227,7 +225,7 @@ template std::unique_ptr gather_chars(StringIterator strings_begin, MapIterator map_begin, MapIterator map_end, - cudf::device_span const offsets, + cudf::detail::input_offsetalator const offsets, size_type chars_bytes, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -300,7 +298,9 @@ std::unique_ptr gather(strings_column_view const& strings, // build offsets column auto const d_strings = column_device_view::create(strings.parent(), stream); - auto const d_in_offsets = !strings.is_empty() ? strings.offsets_begin() : nullptr; + auto const d_in_offsets = cudf::detail::offsetalator_factory::make_input_iterator( + strings.is_empty() ? make_empty_column(type_id::INT32)->view() : strings.offsets(), + strings.offset()); auto offsets_itr = thrust::make_transform_iterator( begin, @@ -308,14 +308,15 @@ std::unique_ptr gather(strings_column_view const& strings, [d_strings = *d_strings, d_in_offsets] __device__(size_type idx) { if (NullifyOutOfBounds && (idx < 0 || idx >= d_strings.size())) { return 0; } if (not d_strings.is_valid(idx)) { return 0; } - return d_in_offsets[idx + 1] - d_in_offsets[idx]; + return static_cast(d_in_offsets[idx + 1] - d_in_offsets[idx]); })); auto [out_offsets_column, total_bytes] = cudf::detail::make_offsets_child_column(offsets_itr, offsets_itr + output_count, stream, mr); // build chars column - auto const offsets_view = out_offsets_column->view(); - auto out_chars_column = gather_chars( + auto const offsets_view = + cudf::detail::offsetalator_factory::make_input_iterator(out_offsets_column->view()); + auto out_chars_column = gather_chars( d_strings->begin(), begin, end, offsets_view, total_bytes, stream, mr); return make_strings_column(output_count, diff --git a/cpp/include/cudf/strings/detail/strings_column_factories.cuh b/cpp/include/cudf/strings/detail/strings_column_factories.cuh index 15b1c2bfec4..de7db4ce47b 100644 --- a/cpp/include/cudf/strings/detail/strings_column_factories.cuh +++ b/cpp/include/cudf/strings/detail/strings_column_factories.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2023, NVIDIA CORPORATION. + * Copyright (c) 2020-2024, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -103,9 +103,8 @@ std::unique_ptr make_strings_column(IndexPairIterator begin, auto const avg_bytes_per_row = bytes / std::max(strings_count - null_count, 1); // use a character-parallel kernel for long string lengths if (avg_bytes_per_row > FACTORY_BYTES_PER_ROW_THRESHOLD) { - auto const d_data = offsets_view.template data(); auto const d_offsets = - device_span{d_data, static_cast(offsets_view.size())}; + cudf::detail::offsetalator_factory::make_input_iterator(offsets_view); auto const str_begin = thrust::make_transform_iterator( begin, cuda::proclaim_return_type([] __device__(auto ip) { return string_view{ip.first, ip.second};