diff --git a/cpp/src/strings/split/split.cu b/cpp/src/strings/split/split.cu
index c87c36ba3b9..fbab5220383 100644
--- a/cpp/src/strings/split/split.cu
+++ b/cpp/src/strings/split/split.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -19,7 +19,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -123,7 +122,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& input,
   // builds the offsets and the vector of all tokens
   auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr);
-  auto const d_offsets   = offsets->view().template data<size_type>();
+  auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
   auto const d_tokens    = tokens.data();
 
   // compute the maximum number of tokens for any string
@@ -132,7 +131,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& input,
     thrust::make_counting_iterator(0),
     thrust::make_counting_iterator(input.size()),
     cuda::proclaim_return_type<size_type>([d_offsets] __device__(auto idx) -> size_type {
-      return d_offsets[idx + 1] - d_offsets[idx];
+      return static_cast<size_type>(d_offsets[idx + 1] - d_offsets[idx]);
     }),
     0,
     thrust::maximum{});
@@ -144,7 +143,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& input,
       cuda::proclaim_return_type<string_index_pair>(
         [d_tokens, d_offsets, col] __device__(size_type idx) {
           auto const offset      = d_offsets[idx];
-          auto const token_count = d_offsets[idx + 1] - offset;
+          auto const token_count = static_cast<size_type>(d_offsets[idx + 1] - offset);
           return (col < token_count) ? d_tokens[offset + col] : string_index_pair{nullptr, 0};
         }));
     results.emplace_back(make_strings_column(itr, itr + input.size(), stream, mr));
@@ -360,12 +359,11 @@ std::unique_ptr<table> whitespace_split_fn(size_type strings_count,
   }
 
   // get the positions for every token
-  rmm::device_uvector<string_index_pair> tokens(columns_count * strings_count, stream);
+  rmm::device_uvector<string_index_pair> tokens(
+    static_cast<int64_t>(columns_count) * static_cast<int64_t>(strings_count), stream);
   string_index_pair* d_tokens = tokens.data();
-  thrust::fill(rmm::exec_policy(stream),
-               d_tokens,
-               d_tokens + (columns_count * strings_count),
-               string_index_pair{nullptr, 0});
+  thrust::fill(
+    rmm::exec_policy(stream), tokens.begin(), tokens.end(), string_index_pair{nullptr, 0});
   thrust::for_each_n(rmm::exec_policy(stream),
                      thrust::make_counting_iterator(0),
                      strings_count,
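Note: the split.cu hunks above all follow one pattern: the offsets child column is
read through cudf::detail::offsetalator_factory::make_input_iterator instead of a
raw size_type pointer, so the same device code works whether the column stores
INT32 or INT64 offsets. The offsetalator always yields int64_t, which is why each
lambda now narrows the adjacent-offset difference back to size_type explicitly. A
minimal sketch of the pattern as a hypothetical standalone helper (only the
offsetalator names and the cast are taken from the diff; the header path is an
assumption):

    #include <cudf/detail/offsetalator.cuh>
    #include <cudf/types.hpp>

    // Token count for row `idx`: subtract adjacent offsets through the
    // offsetalator (always int64_t) and narrow once, at the boundary.
    __device__ cudf::size_type token_count_at(
      cudf::detail::input_offsetalator const d_offsets, cudf::size_type idx)
    {
      return static_cast<cudf::size_type>(d_offsets[idx + 1] - d_offsets[idx]);
    }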
diff --git a/cpp/src/strings/split/split.cuh b/cpp/src/strings/split/split.cuh
index c5fb44fc3dd..906c522e898 100644
--- a/cpp/src/strings/split/split.cuh
+++ b/cpp/src/strings/split/split.cuh
@@ -17,9 +17,9 @@
 #include
 #include
 #include
-#include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -66,9 +66,9 @@ struct base_split_tokenizer {
  * @param chars_bytes Total number of characters to process
  * @return true if delimiter is found starting at position `idx`
  */
-  __device__ bool is_delimiter(size_type idx,
-                               size_type const* d_offsets,
-                               size_type chars_bytes) const
+  __device__ bool is_delimiter(int64_t idx,
+                               cudf::detail::input_offsetalator const d_offsets,
+                               int64_t chars_bytes) const
   {
     auto const d_chars = get_base_ptr() + d_offsets[0];
     if (idx + d_delimiter.size_bytes() > chars_bytes) { return false; }
@@ -87,8 +87,8 @@ struct base_split_tokenizer {
  * @param d_delimiter_offsets Offsets per string to delimiters in d_positions
  */
   __device__ size_type count_tokens(size_type idx,
-                                    size_type const* d_positions,
-                                    size_type const* d_delimiter_offsets) const
+                                    int64_t const* d_positions,
+                                    int64_t const* d_delimiter_offsets) const
   {
     if (!is_valid(idx)) { return 0; }
 
@@ -96,12 +96,13 @@ struct base_split_tokenizer {
     auto const d_str     = get_string(idx);
     auto const d_str_end = d_str.data() + d_str.size_bytes();
     auto const base_ptr  = get_base_ptr() + delim_size - 1;
+
     auto const delimiters =
-      cudf::device_span<size_type const>(d_positions + d_delimiter_offsets[idx],
-                                         d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);
+      cudf::device_span<int64_t const>(d_positions + d_delimiter_offsets[idx],
+                                       d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);
 
     size_type token_count = 1;  // all strings will have at least one token
-    size_type last_pos    = delimiters[0] - delim_size;
+    auto last_pos = !delimiters.empty() ? (delimiters[0] - delim_size) : 0L;
     for (auto d_pos : delimiters) {
       // delimiter must fit in string && overlapping delimiters are ignored
       if (((base_ptr + d_pos) < d_str_end) && ((d_pos - last_pos) >= delim_size)) {
@@ -129,9 +130,9 @@ struct base_split_tokenizer {
  * @param d_all_tokens All output tokens for the strings column
  */
   __device__ void get_tokens(size_type idx,
-                             size_type const* d_tokens_offsets,
-                             size_type const* d_positions,
-                             size_type const* d_delimiter_offsets,
+                             cudf::detail::input_offsetalator const d_tokens_offsets,
+                             int64_t const* d_positions,
+                             int64_t const* d_delimiter_offsets,
                              string_index_pair* d_all_tokens) const
   {
     auto const d_tokens =  // this string's tokens output
@@ -149,8 +150,8 @@ struct base_split_tokenizer {
     }
 
     auto const delimiters =
-      cudf::device_span<size_type const>(d_positions + d_delimiter_offsets[idx],
-                                         d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);
+      cudf::device_span<int64_t const>(d_positions + d_delimiter_offsets[idx],
+                                       d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);
 
     auto& derived = static_cast<Derived const&>(*this);
     derived.process_tokens(d_str, delimiters, d_tokens);
@@ -184,7 +185,7 @@ struct split_tokenizer_fn : base_split_tokenizer<split_tokenizer_fn> {
  * @param d_tokens Output vector to store tokens for this string
  */
   __device__ void process_tokens(string_view const d_str,
-                                 device_span<size_type const> d_delimiters,
+                                 device_span<int64_t const> d_delimiters,
                                  device_span<string_index_pair> d_tokens) const
   {
     auto const base_ptr = get_base_ptr();  // d_positions values based on this
@@ -239,7 +240,7 @@ struct rsplit_tokenizer_fn : base_split_tokenizer<rsplit_tokenizer_fn> {
  * @param d_tokens Output vector to store tokens for this string
  */
   __device__ void process_tokens(string_view const d_str,
-                                 device_span<size_type const> d_delimiters,
+                                 device_span<int64_t const> d_delimiters,
                                  device_span<string_index_pair> d_tokens) const
   {
     auto const base_ptr = get_base_ptr();  // d_positions values are based on this ptr
@@ -290,7 +291,8 @@ struct rsplit_tokenizer_fn : base_split_tokenizer<rsplit_tokenizer_fn> {
  * @param input The input column of strings to split
  * @param tokenizer Object used for counting and identifying delimiters and tokens
  * @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned objects' device memory.
+ * @param mr Device memory resource used to allocate the returned objects' device memory
+ * @return Token offsets and a vector of string indices
 */
 template <typename Tokenizer>
 std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split_helper(
@@ -301,37 +303,38 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split_helper(
 {
   auto const strings_count = input.size();
   auto const chars_bytes =
-    cudf::detail::get_value<size_type>(input.offsets(), input.offset() + strings_count, stream) -
-    cudf::detail::get_value<size_type>(input.offsets(), input.offset(), stream);
-
-  auto d_offsets = input.offsets_begin();
+    get_offset_value(input.offsets(), input.offset() + strings_count, stream) -
+    get_offset_value(input.offsets(), input.offset(), stream);
+  auto const d_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());
 
   // count the number of delimiters in the entire column
   auto const delimiter_count =
     thrust::count_if(rmm::exec_policy(stream),
-                     thrust::make_counting_iterator(0),
-                     thrust::make_counting_iterator(chars_bytes),
-                     [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) {
+                     thrust::counting_iterator<int64_t>(0),
+                     thrust::counting_iterator<int64_t>(chars_bytes),
+                     [tokenizer, d_offsets, chars_bytes] __device__(int64_t idx) {
                        return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
                      });
 
+  // Create a vector of every delimiter position in the chars column.
   // These may include overlapping or otherwise out-of-bounds delimiters which
   // will be resolved during token processing.
-  auto delimiter_positions = rmm::device_uvector<size_type>(delimiter_count, stream);
+  auto delimiter_positions = rmm::device_uvector<int64_t>(delimiter_count, stream);
   auto d_positions         = delimiter_positions.data();
-  auto const copy_end =
-    thrust::copy_if(rmm::exec_policy(stream),
-                    thrust::make_counting_iterator(0),
-                    thrust::make_counting_iterator(chars_bytes),
-                    delimiter_positions.begin(),
-                    [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) {
-                      return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
-                    });
+  auto const copy_end = cudf::detail::copy_if_safe(
+    thrust::counting_iterator<int64_t>(0),
+    thrust::counting_iterator<int64_t>(chars_bytes),
+    delimiter_positions.begin(),
+    [tokenizer, d_offsets, chars_bytes] __device__(int64_t idx) {
+      return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
+    },
+    stream);
 
   // create a vector of offsets to each string's delimiter set within delimiter_positions
   auto const delimiter_offsets = [&] {
     // first, create a vector of string indices for each delimiter
-    auto string_indices = rmm::device_uvector(delimiter_count, stream);
+    auto string_indices = rmm::device_uvector(delimiter_count, stream);
     thrust::upper_bound(rmm::exec_policy(stream),
                         d_offsets,
                         d_offsets + strings_count,
@@ -340,24 +343,24 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split_helper(
                         string_indices.begin());
 
     // compute delimiter offsets per string
-    auto delimiter_offsets   = rmm::device_uvector<size_type>(strings_count + 1, stream);
+    auto delimiter_offsets   = rmm::device_uvector<int64_t>(strings_count + 1, stream);
     auto d_delimiter_offsets = delimiter_offsets.data();
     // memset to zero-out the delimiter counts for any null-entries or strings with no delimiters
     CUDF_CUDA_TRY(cudaMemsetAsync(
-      d_delimiter_offsets, 0, delimiter_offsets.size() * sizeof(size_type), stream.value()));
+      d_delimiter_offsets, 0, delimiter_offsets.size() * sizeof(int64_t), stream.value()));
 
     // next, count the number of delimiters per string
     auto d_string_indices = string_indices.data();  // identifies strings with delimiters only
-    thrust::for_each_n(rmm::exec_policy(stream),
-                       thrust::make_counting_iterator(0),
-                       delimiter_count,
-                       [d_string_indices, d_delimiter_offsets] __device__(size_type idx) {
-                         auto const str_idx = d_string_indices[idx] - 1;
-                         cuda::atomic_ref<size_type, cuda::thread_scope_device> ref{
-                           *(d_delimiter_offsets + str_idx)};
-                         ref.fetch_add(1, cuda::std::memory_order_relaxed);
-                       });
+    thrust::for_each_n(
+      rmm::exec_policy(stream),
+      thrust::counting_iterator<int64_t>(0),
+      delimiter_count,
+      [d_string_indices, d_delimiter_offsets] __device__(int64_t idx) {
+        auto const str_idx = d_string_indices[idx] - 1;
+        cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{*(d_delimiter_offsets + str_idx)};
+        ref.fetch_add(1L, cuda::std::memory_order_relaxed);
+      });
 
     // finally, convert the delimiter counts into offsets
     thrust::exclusive_scan(rmm::exec_policy(stream),
                            delimiter_offsets.begin(),
@@ -379,11 +382,10 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split_helper(
     });
 
   // create offsets from the counts for return to the caller
-  auto offsets = std::get<0>(
-    cudf::detail::make_offsets_child_column(token_counts.begin(), token_counts.end(), stream, mr));
-  auto const total_tokens =
-    cudf::detail::get_value<size_type>(offsets->view(), strings_count, stream);
-  auto const d_tokens_offsets = offsets->view().data<size_type>();
+  auto [offsets, total_tokens] = cudf::strings::detail::make_offsets_child_column(
+    token_counts.begin(), token_counts.end(), stream, mr);
+  auto const d_tokens_offsets =
+    cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
 
   // build a vector of all the token positions for all the strings
   auto tokens = rmm::device_uvector<string_index_pair>(total_tokens, stream);
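Note: two things change in split.cuh above. First, delimiter byte positions move
from size_type to int64_t, since the chars of a large strings column can exceed
INT32_MAX bytes. Second, count_tokens previously read delimiters[0]
unconditionally; the new `!delimiters.empty() ? (delimiters[0] - delim_size) : 0L`
guards that read for strings whose delimiter span is empty. A host-side analogue
of the fixed counting loop (a sketch, under the simplifying assumption that the
per-position string-bounds check is omitted):

    #include <cstdint>
    #include <vector>

    // Count tokens for one string given its sorted delimiter byte positions.
    // An empty list yields exactly one token; delimiters overlapping the
    // previous match are ignored, mirroring the device loop above.
    int count_tokens(std::vector<int64_t> const& delimiters, int64_t delim_size)
    {
      int token_count = 1;  // every valid string has at least one token
      auto last_pos   = !delimiters.empty() ? (delimiters.front() - delim_size) : int64_t{0};
      for (auto d_pos : delimiters) {
        if ((d_pos - last_pos) >= delim_size) {
          ++token_count;
          last_pos = d_pos;
        }
      }
      return token_count;
    }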
diff --git a/cpp/src/strings/split/split_re.cu b/cpp/src/strings/split/split_re.cu
index 045aac279e6..d8385549840 100644
--- a/cpp/src/strings/split/split_re.cu
+++ b/cpp/src/strings/split/split_re.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2022-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2022-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -21,7 +21,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -36,7 +35,6 @@
 #include
 #include
 #include
-#include
 
 namespace cudf {
 namespace strings {
@@ -60,7 +58,7 @@ enum class split_direction {
 struct token_reader_fn {
   column_device_view const d_strings;
   split_direction const direction;
-  size_type const* d_token_offsets;
+  cudf::detail::input_offsetalator const d_token_offsets;
   string_index_pair* d_tokens;
 
   __device__ void operator()(size_type const idx, reprog_device const prog, int32_t const prog_idx)
@@ -73,9 +71,9 @@ struct token_reader_fn {
     auto const token_count  = d_token_offsets[idx + 1] - token_offset;
     auto const d_result     = d_tokens + token_offset;  // store tokens here
 
-    size_type token_idx = 0;
-    auto itr            = d_str.begin();
-    auto last_pos       = itr;
+    int64_t token_idx = 0;
+    auto itr          = d_str.begin();
+    auto last_pos     = itr;
     while (itr.position() <= nchars) {
       auto const match = prog.find(prog_idx, d_str, itr);
       if (!match) { break; }
@@ -90,7 +88,7 @@ struct token_reader_fn {
         d_result[token_idx++] = token;
       } else {
         if (direction == split_direction::FORWARD) { break; }  // we are done
-        for (auto l = 0; l < token_idx - 1; ++l) {
+        for (auto l = 0L; l < token_idx - 1; ++l) {
           d_result[l] = d_result[l + 1];  // shift left
         }
         d_result[token_idx - 1] = token;
@@ -120,50 +118,45 @@ struct token_reader_fn {
 /**
  * @brief Call regex to split each input string into tokens.
  *
- * This will also convert the `offsets` values from counts to offsets.
- *
  * @param d_strings Strings to split
  * @param d_prog Regex to evaluate against each string
  * @param direction Whether tokens are generated forwards or backwards.
  * @param max_tokens The maximum number of tokens for each split.
- * @param offsets The number of matches on input.
- *        The offsets for each token in each string on output.
+ * @param counts The number of tokens in each string
  * @param stream CUDA stream used for kernel launches.
  */
-rmm::device_uvector<string_index_pair> generate_tokens(column_device_view const& d_strings,
-                                                       reprog_device& d_prog,
-                                                       split_direction direction,
-                                                       size_type maxsplit,
-                                                       mutable_column_view& offsets,
-                                                       rmm::cuda_stream_view stream)
+std::pair<rmm::device_uvector<string_index_pair>, std::unique_ptr<column>> generate_tokens(
+  column_device_view const& d_strings,
+  reprog_device& d_prog,
+  split_direction direction,
+  size_type maxsplit,
+  column_view const& counts,
+  rmm::cuda_stream_view stream)
 {
   auto const strings_count = d_strings.size();
-
-  auto const max_tokens = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
-
-  auto const begin     = thrust::make_counting_iterator(0);
-  auto const end       = thrust::make_counting_iterator(strings_count);
-  auto const d_offsets = offsets.data<size_type>();
+  auto const max_tokens    = maxsplit > 0 ? maxsplit : std::numeric_limits<size_type>::max();
+  auto const d_counts      = counts.data<size_type>();
 
   // convert match counts to token offsets
-  auto map_fn = [d_strings, d_offsets, max_tokens] __device__(auto idx) {
-    return d_strings.is_null(idx) ? 0 : std::min(d_offsets[idx], max_tokens) + 1;
-  };
-  thrust::transform_exclusive_scan(
-    rmm::exec_policy(stream), begin, end + 1, d_offsets, map_fn, 0, thrust::plus{});
+  auto map_fn = cuda::proclaim_return_type<size_type>(
+    [d_strings, d_counts, max_tokens] __device__(auto idx) -> size_type {
+      return d_strings.is_null(idx) ? 0 : std::min(d_counts[idx], max_tokens) + 1;
+    });
 
-  // the last offset entry is the total number of tokens to be generated
-  auto const total_tokens = cudf::detail::get_value<size_type>(offsets, strings_count, stream);
+  auto const begin = cudf::detail::make_counting_transform_iterator(0, map_fn);
+  auto const end   = begin + strings_count;
 
-  rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
-  if (total_tokens == 0) { return tokens; }
-
-  launch_for_each_kernel(token_reader_fn{d_strings, direction, d_offsets, tokens.data()},
-                         d_prog,
-                         d_strings.size(),
-                         stream);
+  auto [offsets, total_tokens] = cudf::strings::detail::make_offsets_child_column(
+    begin, end, stream, rmm::mr::get_current_device_resource());
+  auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
 
-  return tokens;
+  // build a vector of tokens
+  rmm::device_uvector<string_index_pair> tokens(total_tokens, stream);
+  if (total_tokens > 0) {
+    auto tr_fn = token_reader_fn{d_strings, direction, d_offsets, tokens.data()};
+    launch_for_each_kernel(tr_fn, d_prog, d_strings.size(), stream);
+  }
+  return std::pair(std::move(tokens), std::move(offsets));
 }
 
 /**
@@ -176,13 +169,13 @@ rmm::device_uvector<string_index_pair> generate_tokens(column_device_view const&
 struct tokens_transform_fn {
   column_device_view const d_strings;
   string_index_pair const* d_tokens;
-  size_type const* d_token_offsets;
+  cudf::detail::input_offsetalator const d_token_offsets;
   size_type const column_index;
 
   __device__ string_index_pair operator()(size_type idx) const
   {
     auto const offset      = d_token_offsets[idx];
-    auto const token_count = d_token_offsets[idx + 1] - offset;
+    auto const token_count = static_cast<size_type>(d_token_offsets[idx + 1] - offset);
     return (column_index >= token_count) || d_strings.is_null(idx)
              ? string_index_pair{nullptr, 0}
              : d_tokens[offset + column_index];
@@ -212,13 +205,13 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
   auto d_strings = column_device_view::create(input.parent(), stream);
 
   // count the number of delimiters matched in each string
-  auto offsets = count_matches(
-    *d_strings, *d_prog, strings_count + 1, stream, rmm::mr::get_current_device_resource());
-  auto offsets_view = offsets->mutable_view();
-  auto d_offsets    = offsets_view.data<size_type>();
+  auto const counts = count_matches(
+    *d_strings, *d_prog, strings_count, stream, rmm::mr::get_current_device_resource());
 
   // get the split tokens from the input column; this also converts the counts into offsets
-  auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream);
+  auto [tokens, offsets] =
+    generate_tokens(*d_strings, *d_prog, direction, maxsplit, counts->view(), stream);
+  auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
 
   // the output column count is the maximum number of tokens generated for any input string
   auto const columns_count = thrust::transform_reduce(
@@ -226,7 +219,7 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
     thrust::make_counting_iterator(0),
     thrust::make_counting_iterator(strings_count),
     [d_offsets] __device__(auto const idx) -> size_type {
-      return d_offsets[idx + 1] - d_offsets[idx];
+      return static_cast<size_type>(d_offsets[idx + 1] - d_offsets[idx]);
     },
     0,
     thrust::maximum{});
@@ -243,10 +236,11 @@ std::unique_ptr<table> split_re(strings_column_view const& input,
   }
 
   // convert the tokens into multiple strings columns
+  auto d_tokens            = tokens.data();
   auto make_strings_lambda = [&](size_type column_index) {
     // returns appropriate token for each row/column
     auto indices_itr = cudf::detail::make_counting_transform_iterator(
-      0, tokens_transform_fn{*d_strings, tokens.data(), d_offsets, column_index});
+      0, tokens_transform_fn{*d_strings, d_tokens, d_offsets, column_index});
     return make_strings_column(indices_itr, indices_itr + strings_count, stream, mr);
   };
   // build a vector of columns
@@ -276,11 +270,14 @@ std::unique_ptr<column> split_record_re(strings_column_view const& input,
   auto d_strings = column_device_view::create(input.parent(), stream);
 
   // count the number of delimiters matched in each string
-  auto offsets      = count_matches(*d_strings, *d_prog, strings_count + 1, stream, mr);
-  auto offsets_view = offsets->mutable_view();
+  auto counts = count_matches(*d_strings, *d_prog, strings_count, stream, mr);
 
   // get the split tokens from the input column; this also converts the counts into offsets
-  auto tokens = generate_tokens(*d_strings, *d_prog, direction, maxsplit, offsets_view, stream);
+  auto [tokens, offsets] =
+    generate_tokens(*d_strings, *d_prog, direction, maxsplit, counts->view(), stream);
+  CUDF_EXPECTS(tokens.size() < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
+               "Size of output exceeds the column size limit",
+               std::overflow_error);
 
   // convert the tokens into one big strings column
   auto strings_output = make_strings_column(tokens.begin(), tokens.end(), stream, mr);
diff --git a/cpp/src/strings/split/split_record.cu b/cpp/src/strings/split/split_record.cu
index 64061aba4fd..c9ed7b0ed26 100644
--- a/cpp/src/strings/split/split_record.cu
+++ b/cpp/src/strings/split/split_record.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2024, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -66,6 +66,9 @@ std::unique_ptr<column> split_record_fn(strings_column_view const& input,
 
   // builds the offsets and the vector of all tokens
   auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr);
+  CUDF_EXPECTS(tokens.size() < static_cast<std::size_t>(std::numeric_limits<size_type>::max()),
+               "Size of output exceeds the column size limit",
+               std::overflow_error);
 
   // build a strings column from the tokens
   auto strings_child = make_strings_column(tokens.begin(), tokens.end(), stream, mr);
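Note: the CUDF_EXPECTS guards added to split_record_fn and split_record_re close
out the change: token totals are now carried in 64 bits throughout, so both
record paths validate the total against the 32-bit size_type limit before
building the output column and throw std::overflow_error instead of silently
wrapping. The guard in isolation (a sketch; `check_column_size` is a
hypothetical wrapper around the exact expression used in the diff):

    #include <cudf/types.hpp>
    #include <cudf/utilities/error.hpp>

    #include <cstddef>
    #include <limits>
    #include <stdexcept>

    // Refuse to build a strings column whose row count cannot be represented
    // by cudf::size_type (int32_t).
    void check_column_size(std::size_t total_tokens)
    {
      CUDF_EXPECTS(
        total_tokens < static_cast<std::size_t>(std::numeric_limits<cudf::size_type>::max()),
        "Size of output exceeds the column size limit",
        std::overflow_error);
    }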