Use offsetalator in cudf::strings::split functions (#14757)
Adds the offsetalator in place of hardcoded offset-type arrays in the strings split functions (a short sketch of the offsetalator pattern follows the list):
- `cudf::strings::split()`
- `cudf::strings::rsplit()`
- `cudf::strings::split_record()`
- `cudf::strings::rsplit_record()`
- `cudf::strings::split_re()`
- `cudf::strings::rsplit_re()`
- `cudf::strings::split_record_re()`
- `cudf::strings::rsplit_record_re()`
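For context, the offsetalator replaces direct `size_type const*` access to a strings column's offsets child with an iterator that reads either INT32 or INT64 offsets and always yields int64_t values, which is what lets these kernels address character data beyond the 2 GB size_type limit. Below is a minimal sketch of the pattern the diff adopts; the header path and the wrapper function are assumptions for illustration, not part of this commit:

#include <cudf/column/column_view.hpp>
#include <cudf/detail/offsets_iterator_factory.cuh>  // header path assumed
#include <cudf/types.hpp>
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>
#include <thrust/iterator/counting_iterator.h>
#include <thrust/transform.h>

// Hypothetical helper: computes per-row byte widths without caring whether
// `offsets` is an INT32 or an INT64 column.
void row_byte_widths(cudf::column_view const& offsets,
                     cudf::size_type num_rows,
                     int64_t* d_widths,  // device buffer of size num_rows
                     rmm::cuda_stream_view stream)
{
  // The input offsetalator normalizes both offset types to int64_t reads
  auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets);
  thrust::transform(rmm::exec_policy(stream),
                    thrust::counting_iterator<cudf::size_type>(0),
                    thrust::counting_iterator<cudf::size_type>(num_rows),
                    d_widths,
                    [d_offsets] __device__(cudf::size_type i) {
                      return d_offsets[i + 1] - d_offsets[i];  // int64_t arithmetic
                    });
}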

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Mike Wilson (https://github.com/hyperbolic2346)

URL: #14757
davidwendt authored Feb 8, 2024
1 parent 03f63ec commit 47d28a0
Showing 4 changed files with 112 additions and 112 deletions.
cpp/src/strings/split/split.cu (18 changes: 8 additions & 10 deletions)
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2023, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@@ -19,7 +19,6 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
- #include <cudf/detail/get_value.cuh>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/detail/split_utils.cuh>
@@ -123,7 +122,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& input,

// builds the offsets and the vector of all tokens
auto [offsets, tokens] = split_helper(input, tokenizer, stream, mr);
- auto const d_offsets = offsets->view().template data<size_type>();
+ auto const d_offsets = cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());
auto const d_tokens = tokens.data();

// compute the maximum number of tokens for any string
@@ -132,7 +131,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& input,
thrust::make_counting_iterator<size_type>(0),
thrust::make_counting_iterator<size_type>(input.size()),
cuda::proclaim_return_type<size_type>([d_offsets] __device__(auto idx) -> size_type {
- return d_offsets[idx + 1] - d_offsets[idx];
+ return static_cast<size_type>(d_offsets[idx + 1] - d_offsets[idx]);
}),
0,
thrust::maximum{});
@@ -144,7 +143,7 @@ std::unique_ptr<table> split_fn(strings_column_view const& input,
cuda::proclaim_return_type<string_index_pair>(
[d_tokens, d_offsets, col] __device__(size_type idx) {
auto const offset = d_offsets[idx];
- auto const token_count = d_offsets[idx + 1] - offset;
+ auto const token_count = static_cast<size_type>(d_offsets[idx + 1] - offset);
return (col < token_count) ? d_tokens[offset + col] : string_index_pair{nullptr, 0};
}));
results.emplace_back(make_strings_column(itr, itr + input.size(), stream, mr));
@@ -360,12 +359,11 @@ std::unique_ptr<table> whitespace_split_fn(size_type strings_count,
}

// get the positions for every token
- rmm::device_uvector<string_index_pair> tokens(columns_count * strings_count, stream);
+ rmm::device_uvector<string_index_pair> tokens(
+ static_cast<int64_t>(columns_count) * static_cast<int64_t>(strings_count), stream);
string_index_pair* d_tokens = tokens.data();
- thrust::fill(rmm::exec_policy(stream),
- d_tokens,
- d_tokens + (columns_count * strings_count),
- string_index_pair{nullptr, 0});
+ thrust::fill(
+ rmm::exec_policy(stream), tokens.begin(), tokens.end(), string_index_pair{nullptr, 0});
thrust::for_each_n(rmm::exec_policy(stream),
thrust::make_counting_iterator<size_type>(0),
strings_count,
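The `whitespace_split_fn` change above is an overflow fix as much as a cleanup: the old allocation multiplied two size_type (int32_t) values, a product that can wrap for wide splits of large columns, so the new code widens both operands first. A small self-contained illustration with made-up values, not taken from the commit:

#include <cstdint>

int main()
{
  int32_t columns_count = 100'000;  // illustrative values only
  int32_t strings_count = 50'000;
  // int32 multiply would need 5'000'000'000, which overflows (undefined behavior):
  //   auto bad = columns_count * strings_count;
  // widening first, as the new code does, stays exact:
  auto good = static_cast<int64_t>(columns_count) * static_cast<int64_t>(strings_count);
  return good > 0 ? 0 : 1;
}

Switching thrust::fill to tokens.begin()/tokens.end() likewise avoids recomputing that product by hand.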
cpp/src/strings/split/split.cuh (102 changes: 52 additions & 50 deletions)
@@ -17,9 +17,9 @@
#include <cudf/column/column.hpp>
#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
- #include <cudf/detail/get_value.cuh>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/detail/sizes_to_offsets_iterator.cuh>
+ #include <cudf/detail/utilities/algorithm.cuh>
#include <cudf/strings/detail/split_utils.cuh>
#include <cudf/strings/detail/strings_column_factories.cuh>
#include <cudf/strings/string_view.cuh>
@@ -66,9 +66,9 @@ struct base_split_tokenizer {
* @param chars_bytes Total number of characters to process
* @return true if delimiter is found starting at position `idx`
*/
- __device__ bool is_delimiter(size_type idx,
- size_type const* d_offsets,
- size_type chars_bytes) const
+ __device__ bool is_delimiter(int64_t idx,
+ cudf::detail::input_offsetalator const d_offsets,
+ int64_t chars_bytes) const
{
auto const d_chars = get_base_ptr() + d_offsets[0];
if (idx + d_delimiter.size_bytes() > chars_bytes) { return false; }
@@ -87,21 +87,22 @@
* @param d_delimiter_offsets Offsets per string to delimiters in d_positions
*/
__device__ size_type count_tokens(size_type idx,
- size_type const* d_positions,
- size_type const* d_delimiter_offsets) const
+ int64_t const* d_positions,
+ int64_t const* d_delimiter_offsets) const
{
if (!is_valid(idx)) { return 0; }

auto const delim_size = d_delimiter.size_bytes();
auto const d_str = get_string(idx);
auto const d_str_end = d_str.data() + d_str.size_bytes();
auto const base_ptr = get_base_ptr() + delim_size - 1;

auto const delimiters =
- cudf::device_span<size_type const>(d_positions + d_delimiter_offsets[idx],
- d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);
+ cudf::device_span<int64_t const>(d_positions + d_delimiter_offsets[idx],
+ d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);

size_type token_count = 1; // all strings will have at least one token
- size_type last_pos = delimiters[0] - delim_size;
+ auto last_pos = !delimiters.empty() ? (delimiters[0] - delim_size) : 0L;
for (auto d_pos : delimiters) {
// delimiter must fit in string && overlapping delimiters are ignored
if (((base_ptr + d_pos) < d_str_end) && ((d_pos - last_pos) >= delim_size)) {
@@ -129,9 +130,9 @@
* @param d_all_tokens All output tokens for the strings column
*/
__device__ void get_tokens(size_type idx,
- size_type const* d_tokens_offsets,
- size_type const* d_positions,
- size_type const* d_delimiter_offsets,
+ cudf::detail::input_offsetalator const d_tokens_offsets,
+ int64_t const* d_positions,
+ int64_t const* d_delimiter_offsets,
string_index_pair* d_all_tokens) const
{
auto const d_tokens = // this string's tokens output
Expand All @@ -149,8 +150,8 @@ struct base_split_tokenizer {
}

auto const delimiters =
- cudf::device_span<size_type const>(d_positions + d_delimiter_offsets[idx],
- d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);
+ cudf::device_span<int64_t const>(d_positions + d_delimiter_offsets[idx],
+ d_delimiter_offsets[idx + 1] - d_delimiter_offsets[idx]);

auto& derived = static_cast<Derived const&>(*this);
derived.process_tokens(d_str, delimiters, d_tokens);
@@ -184,7 +185,7 @@ struct split_tokenizer_fn : base_split_tokenizer<split_tokenizer_fn> {
* @param d_tokens Output vector to store tokens for this string
*/
__device__ void process_tokens(string_view const d_str,
- device_span<size_type const> d_delimiters,
+ device_span<int64_t const> d_delimiters,
device_span<string_index_pair> d_tokens) const
{
auto const base_ptr = get_base_ptr(); // d_positions values based on this
@@ -239,7 +240,7 @@ struct rsplit_tokenizer_fn : base_split_tokenizer<rsplit_tokenizer_fn> {
* @param d_tokens Output vector to store tokens for this string
*/
__device__ void process_tokens(string_view const d_str,
- device_span<size_type const> d_delimiters,
+ device_span<int64_t const> d_delimiters,
device_span<string_index_pair> d_tokens) const
{
auto const base_ptr = get_base_ptr(); // d_positions values are based on this ptr
@@ -290,7 +291,8 @@
* @param input The input column of strings to split
* @param tokenizer Object used for counting and identifying delimiters and tokens
* @param stream CUDA stream used for device memory operations and kernel launches
- * @param mr Device memory resource used to allocate the returned objects' device memory.
+ * @param mr Device memory resource used to allocate the returned objects' device memory
+ * @return Token offsets and a vector of string indices
*/
template <typename Tokenizer>
std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split_helper(
@@ -301,37 +303,38 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
{
auto const strings_count = input.size();
auto const chars_bytes =
- cudf::detail::get_value<size_type>(input.offsets(), input.offset() + strings_count, stream) -
- cudf::detail::get_value<size_type>(input.offsets(), input.offset(), stream);
-
- auto d_offsets = input.offsets_begin();
+ get_offset_value(input.offsets(), input.offset() + strings_count, stream) -
+ get_offset_value(input.offsets(), input.offset(), stream);
+ auto const d_offsets =
+ cudf::detail::offsetalator_factory::make_input_iterator(input.offsets(), input.offset());

// count the number of delimiters in the entire column
auto const delimiter_count =
thrust::count_if(rmm::exec_policy(stream),
- thrust::make_counting_iterator<size_type>(0),
- thrust::make_counting_iterator<size_type>(chars_bytes),
- [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) {
+ thrust::counting_iterator<int64_t>(0),
+ thrust::counting_iterator<int64_t>(chars_bytes),
+ [tokenizer, d_offsets, chars_bytes] __device__(int64_t idx) {
return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
});

// Create a vector of every delimiter position in the chars column.
// These may include overlapping or otherwise out-of-bounds delimiters which
// will be resolved during token processing.
- auto delimiter_positions = rmm::device_uvector<size_type>(delimiter_count, stream);
+ auto delimiter_positions = rmm::device_uvector<int64_t>(delimiter_count, stream);
auto d_positions = delimiter_positions.data();
- auto const copy_end =
- thrust::copy_if(rmm::exec_policy(stream),
- thrust::make_counting_iterator<size_type>(0),
- thrust::make_counting_iterator<size_type>(chars_bytes),
- delimiter_positions.begin(),
- [tokenizer, d_offsets, chars_bytes] __device__(size_type idx) {
- return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
- });
+ auto const copy_end = cudf::detail::copy_if_safe(
+ thrust::counting_iterator<int64_t>(0),
+ thrust::counting_iterator<int64_t>(chars_bytes),
+ delimiter_positions.begin(),
+ [tokenizer, d_offsets, chars_bytes] __device__(int64_t idx) {
+ return tokenizer.is_delimiter(idx, d_offsets, chars_bytes);
+ },
+ stream);

// create a vector of offsets to each string's delimiter set within delimiter_positions
auto const delimiter_offsets = [&] {
// first, create a vector of string indices for each delimiter
- auto string_indices = rmm::device_uvector<size_type>(delimiter_count, stream);
+ auto string_indices = rmm::device_uvector<int64_t>(delimiter_count, stream);
thrust::upper_bound(rmm::exec_policy(stream),
d_offsets,
d_offsets + strings_count,
Expand All @@ -340,24 +343,24 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
string_indices.begin());

// compute delimiter offsets per string
- auto delimiter_offsets = rmm::device_uvector<size_type>(strings_count + 1, stream);
+ auto delimiter_offsets = rmm::device_uvector<int64_t>(strings_count + 1, stream);
auto d_delimiter_offsets = delimiter_offsets.data();

// memset to zero-out the delimiter counts for any null-entries or strings with no delimiters
CUDF_CUDA_TRY(cudaMemsetAsync(
- d_delimiter_offsets, 0, delimiter_offsets.size() * sizeof(size_type), stream.value()));
+ d_delimiter_offsets, 0, delimiter_offsets.size() * sizeof(int64_t), stream.value()));

// next, count the number of delimiters per string
auto d_string_indices = string_indices.data(); // identifies strings with delimiters only
- thrust::for_each_n(rmm::exec_policy(stream),
- thrust::make_counting_iterator<size_type>(0),
- delimiter_count,
- [d_string_indices, d_delimiter_offsets] __device__(size_type idx) {
- auto const str_idx = d_string_indices[idx] - 1;
- cuda::atomic_ref<size_type, cuda::thread_scope_device> ref{
- *(d_delimiter_offsets + str_idx)};
- ref.fetch_add(1, cuda::std::memory_order_relaxed);
- });
+ thrust::for_each_n(
+ rmm::exec_policy(stream),
+ thrust::counting_iterator<int64_t>(0),
+ delimiter_count,
+ [d_string_indices, d_delimiter_offsets] __device__(int64_t idx) {
+ auto const str_idx = d_string_indices[idx] - 1;
+ cuda::atomic_ref<int64_t, cuda::thread_scope_device> ref{*(d_delimiter_offsets + str_idx)};
+ ref.fetch_add(1L, cuda::std::memory_order_relaxed);
+ });
// finally, convert the delimiter counts into offsets
thrust::exclusive_scan(rmm::exec_policy(stream),
delimiter_offsets.begin(),
@@ -379,11 +382,10 @@ std::pair<std::unique_ptr<column>, rmm::device_uvector<string_index_pair>> split
});

// create offsets from the counts for return to the caller
- auto offsets = std::get<0>(
- cudf::detail::make_offsets_child_column(token_counts.begin(), token_counts.end(), stream, mr));
- auto const total_tokens =
- cudf::detail::get_value<size_type>(offsets->view(), strings_count, stream);
- auto const d_tokens_offsets = offsets->view().data<size_type>();
+ auto [offsets, total_tokens] = cudf::strings::detail::make_offsets_child_column(
+ token_counts.begin(), token_counts.end(), stream, mr);
+ auto const d_tokens_offsets =
+ cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());

// build a vector of all the token positions for all the strings
auto tokens = rmm::device_uvector<string_index_pair>(total_tokens, stream);
(Diff truncated: the remaining two changed files are not shown here.)
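One pattern worth calling out from the final hunk above: the old code materialized the offsets column and then read the total token count back from device memory with cudf::detail::get_value, while the new cudf::strings::detail::make_offsets_child_column returns that total alongside the column, and the offsets are then consumed through the offsetalator instead of a raw size_type pointer. A sketch of the new shape, with the return types inferred from this diff rather than verified against the headers:

// token_counts: rmm::device_uvector<cudf::size_type> of per-string token counts
auto [offsets, total_tokens] = cudf::strings::detail::make_offsets_child_column(
  token_counts.begin(), token_counts.end(), stream, mr);
// `offsets` may be INT32 or INT64 depending on the totals; `total_tokens` is the
// final offset value, so no separate device-to-host read-back is required
auto const d_tokens_offsets =
  cudf::detail::offsetalator_factory::make_input_iterator(offsets->view());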
