Skip to content

Commit

Permalink
Implement strings::repeat_strings (#8423)
Browse files Browse the repository at this point in the history
This PR implements `strings::repeat_strings` which repeats the given string(s) multiple times. In contrast with the existing API `cudf::repeat` that repeats the rows (copies one row into multiple rows), this new API repeats the string within each row of the given strings column (copies the content of each string multiple times into the output string). For example:
```
strs = ['aa', null, '',  'bbc']
out  = repeat_strings(strs, 3)
out is ['aaaaaa', null, '',  'bbcbbcbbc']
```

This implements the cudf-side API for NVIDIA/spark-rapids#68.

Authors:
  - Nghia Truong (https://github.com/ttnghia)

Approvers:
  - AJ Schmidt (https://github.com/ajschmidt8)
  - David Wendt (https://github.com/davidwendt)
  - Karthikeyan (https://github.com/karthikeyann)
  - Robert Maynard (https://github.com/robertmaynard)

URL: #8423
  • Loading branch information
ttnghia authored Jun 9, 2021
1 parent a9f15b8 commit 8df3c6a
Show file tree
Hide file tree
Showing 7 changed files with 539 additions and 0 deletions.
1 change: 1 addition & 0 deletions conda/recipes/libcudf/meta.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -195,6 +195,7 @@ test:
- test -f $PREFIX/include/cudf/strings/find_multiple.hpp
- test -f $PREFIX/include/cudf/strings/json.hpp
- test -f $PREFIX/include/cudf/strings/padding.hpp
- test -f $PREFIX/include/cudf/strings/repeat_strings.hpp
- test -f $PREFIX/include/cudf/strings/replace.hpp
- test -f $PREFIX/include/cudf/strings/replace_re.hpp
- test -f $PREFIX/include/cudf/strings/split/partition.hpp
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -360,6 +360,7 @@ add_library(cudf
src/strings/json/json_path.cu
src/strings/regex/regcomp.cpp
src/strings/regex/regexec.cu
src/strings/repeat_strings.cu
src/strings/replace/backref_re.cu
src/strings/replace/backref_re_large.cu
src/strings/replace/backref_re_medium.cu
Expand Down
90 changes: 90 additions & 0 deletions cpp/include/cudf/strings/repeat_strings.hpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,90 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/
#pragma once

#include <cudf/scalar/scalar.hpp>
#include <cudf/strings/strings_column_view.hpp>

namespace cudf {
namespace strings {
/**
* @addtogroup strings_copy
* @{
* @file
* @brief Strings APIs for copying strings.
*/

/**
* @brief Repeat the given string scalar by a given number of times.
*
* For a given string scalar, an output string scalar is generated by repeating the input string by
* a number of times given by the @p `repeat_times` parameter. If `repeat_times` is not a positive
* value, an empty (valid) string scalar will be returned. An invalid input scalar will always
* result in an invalid output scalar regardless of the value of `repeat_times` parameter.
*
* @code{.pseudo}
* Example:
* s = '123XYZ-'
* out = repeat_strings(s, 3)
* out is '123XYZ-123XYZ-123XYZ-'
* @endcode
*
* @throw cudf::logic_error if the size of the output string scalar exceeds the maximum value that
* can be stored by the index type
* (i.e., `input.size() * repeat_times > numeric_limits<size_type>::max()`).
*
* @param input The scalar containing the string to repeat.
* @param repeat_times The number of times the `input` string is copied to the output.
* @param mr Device memory resource used to allocate the returned string scalar.
* @return New string scalar in which the string is repeated from the input.
*/
std::unique_ptr<string_scalar> repeat_strings(
string_scalar const& input,
size_type repeat_times,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
* @brief Repeat each string in the given strings column by a given number of times.
*
* For a given strings column, an output strings column is generated by repeating each string from
* the input by a number of times given by the @p `repeat_times` parameter. If `repeat_times` is not
* a positive value, every non-null row of the output strings column will be an empty string. Any
* null row will result in a null row regardless of the value of `repeat_times` parameter.
*
* Note that this function cannot handle the cases when the size of the output column exceeds the
* maximum value that can be indexed by size_type (offset_type). In such situations, an exception
* may be thrown, or the output result is undefined.
*
* @code{.pseudo}
* Example:
* strs = ['aa', null, '', 'bbc']
* out = repeat_strings(strs, 3)
* out is ['aaaaaa', null, '', 'bbcbbcbbc']
* @endcode
*
* @param input The column containing strings to repeat.
* @param repeat_times The number of times each input string is copied to the output.
* @param mr Device memory resource used to allocate the returned strings column.
* @return New strings column in which each row holds the corresponding input string repeated.
*/
std::unique_ptr<column> repeat_strings(
strings_column_view const& input,
size_type repeat_times,
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/** @} */ // end of doxygen group
} // namespace strings
} // namespace cudf
1 change: 1 addition & 0 deletions cpp/include/doxygen_groups.h
Original file line number Diff line number Diff line change
Expand Up @@ -122,6 +122,7 @@
* @defgroup strings_combine Combining
* @defgroup strings_contains Searching
* @defgroup strings_convert Converting
* @defgroup strings_copy Copying
* @defgroup strings_substring Substring
* @defgroup strings_find Finding
* @defgroup strings_modify Modifying
Expand Down
194 changes: 194 additions & 0 deletions cpp/src/strings/repeat_strings.cu
Original file line number Diff line number Diff line change
@@ -0,0 +1,194 @@
/*
* Copyright (c) 2021, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

#include <cudf/column/column_device_view.cuh>
#include <cudf/column/column_factories.hpp>
#include <cudf/detail/null_mask.hpp>
#include <cudf/detail/nvtx/ranges.hpp>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/detail/utilities.hpp>
#include <cudf/strings/repeat_strings.hpp>
#include <cudf/strings/strings_column_view.hpp>
#include <cudf/utilities/error.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <thrust/transform.h>

namespace cudf {
namespace strings {
namespace detail {

/**
 * @brief Build a string scalar holding `input` repeated `repeat_times` times.
 *
 * - An invalid input yields an invalid (empty) scalar.
 * - An empty input, or a non-positive `repeat_times`, yields a valid empty scalar.
 * - `repeat_times == 1` yields a copy of the input.
 *
 * @throw cudf::logic_error if `input.size() * repeat_times` would not fit in `size_type`.
 *
 * @param input The scalar containing the string to repeat.
 * @param repeat_times The number of times the input string is copied to the output.
 * @param stream CUDA stream on which the device work is issued.
 * @param mr Device memory resource used to allocate the output buffer.
 * @return New string scalar holding the repeated string.
 */
std::unique_ptr<string_scalar> repeat_strings(string_scalar const& input,
                                              size_type repeat_times,
                                              rmm::cuda_stream_view stream,
                                              rmm::mr::device_memory_resource* mr)
{
  // A null scalar stays null no matter how many repetitions were requested.
  if (!input.is_valid(stream)) { return std::make_unique<string_scalar>("", false, stream, mr); }

  auto const str_size = input.size();

  // Nothing to copy: the result is a valid, empty string.
  if (str_size == 0 || repeat_times <= 0) {
    return std::make_unique<string_scalar>("", true, stream, mr);
  }

  // A single repetition is just a copy of the input.
  if (repeat_times == 1) { return std::make_unique<string_scalar>(input, stream, mr); }

  // Dividing (rather than multiplying) keeps the range check itself free of overflow.
  CUDF_EXPECTS(str_size <= std::numeric_limits<size_type>::max() / repeat_times,
               "The output string has size that exceeds the maximum allowed size.");

  auto const output_size = repeat_times * str_size;
  auto buff              = rmm::device_buffer(output_size, stream, mr);
  auto const first       = thrust::make_counting_iterator(0);

  // Each output byte at position `pos` is read from the input byte at `pos % str_size`.
  thrust::transform(rmm::exec_policy(stream),
                    first,
                    first + output_size,
                    static_cast<char*>(buff.data()),
                    [src = input.data(), str_size] __device__(auto const pos) {
                      return src[pos % str_size];
                    });

  // NOTE(review): unlike the other return paths, no stream/mr is forwarded to this constructor --
  // confirm that relying on its defaults is intended.
  return std::make_unique<string_scalar>(std::move(buff));
}

namespace {
/**
 * @brief Build a strings column whose rows are all empty strings or nulls.
 *
 * The result has `strings_count` rows, a zero-filled offsets child, a chars child created with
 * zero bytes, and a copy of the input column's bitmask (so null rows stay null).
 *
 * @param input The strings column whose null mask and null count are reused.
 * @param strings_count Number of rows in the output column.
 * @param stream CUDA stream on which the device work is issued.
 * @param mr Device memory resource used to allocate the output column.
 * @return New strings column containing only empty/null rows.
 */
auto generate_empty_output(strings_column_view const& input,
                           size_type strings_count,
                           rmm::cuda_stream_view stream,
                           rmm::mr::device_memory_resource* mr)
{
  auto chars_column = create_chars_child_column(strings_count, 0, stream, mr);

  auto const offsets_type = data_type{type_to_id<offset_type>()};
  auto offsets_column =
    make_numeric_column(offsets_type, strings_count + 1, mask_state::UNALLOCATED, stream, mr);

  // Every string is empty, so all `strings_count + 1` offsets are zero.
  auto offsets_view        = offsets_column->mutable_view();
  auto const offsets_bytes = offsets_view.size() * sizeof(offset_type);
  CUDA_TRY(
    cudaMemsetAsync(offsets_view.template data<offset_type>(), 0, offsets_bytes, stream.value()));

  return make_strings_column(strings_count,
                             std::move(offsets_column),
                             std::move(chars_column),
                             input.null_count(),
                             cudf::detail::copy_bitmask(input.parent(), stream, mr),
                             stream,
                             mr);
}

/**
 * @brief Functor that either sizes or fills the repeated output strings.
 *
 * The call operator is written for indices in `[0, repeat_times * strings_count)`: each input row
 * is visited by `repeat_times` consecutive indices, which spreads the copy work of one row over
 * several invocations. It must only be used with `repeat_times > 0` (the index is divided by it).
 */
struct compute_size_and_repeat_fn {
  column_device_view const strings_dv;
  size_type const repeat_times;
  bool const has_nulls;

  // Receives the output size of each row in the sizing pass and is read back as the starting
  // offset of each row in the copy pass.
  // NOTE(review): presumably assigned, and converted from sizes to offsets, by
  // `make_strings_children` -- confirm.
  offset_type* d_offsets{nullptr};

  // Selects the pass: `nullptr` means "compute output sizes only", non-null means "copy the
  // characters only".
  char* d_chars{nullptr};

  // `idx` is expected to be in the range of [0, repeat_times * strings_count).
  __device__ void operator()(size_type const idx) const noexcept
  {
    auto const row         = idx / repeat_times;  // which input string this index works on
    auto const copy        = idx % repeat_times;  // which repetition of that string
    bool const row_is_null = has_nulls && !strings_dv.is_valid_nocheck(row);

    if (d_chars == nullptr) {
      // Sizing pass: only the first index of each row records the output size of that row.
      if (copy == 0) {
        d_offsets[row] =
          row_is_null ? 0 : repeat_times * strings_dv.element<string_view>(row).size_bytes();
      }
      return;
    }

    // Copy pass: null rows produce no characters.
    if (row_is_null) { return; }

    auto const d_str     = strings_dv.element<string_view>(row);
    auto const num_bytes = d_str.size_bytes();
    if (num_bytes == 0) { return; }

    // This index writes the `copy`-th repetition of the row's string.
    std::memcpy(d_chars + d_offsets[row] + copy * num_bytes, d_str.data(), num_bytes);
  }
};

} // namespace

/**
 * @brief Build a strings column in which each row is the input row repeated `repeat_times` times.
 *
 * - An empty input column yields an empty strings column.
 * - A non-positive `repeat_times` yields a column of empty strings (null rows stay null).
 * - `repeat_times == 1` yields a copy of the input column.
 *
 * @throw cudf::logic_error if `input.size() * repeat_times` (the number of work items launched)
 * would not fit in `size_type`.
 *
 * @param input The column containing strings to repeat.
 * @param repeat_times The number of times each input string is copied to the output.
 * @param stream CUDA stream on which the device work is issued.
 * @param mr Device memory resource used to allocate the output column.
 * @return New strings column holding the repeated strings.
 */
std::unique_ptr<column> repeat_strings(strings_column_view const& input,
                                       size_type repeat_times,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
{
  auto const strings_count = input.size();
  if (strings_count == 0) { return make_empty_column(data_type{type_id::STRING}); }

  if (repeat_times <= 0) {
    // If the number of repetitions is not positive, each row of the output strings column will be
    // either an empty string (if the input row is not null), or a null (if the input row is null).
    return generate_empty_output(input, strings_count, stream, mr);
  }

  // If `repeat_times == 1`, just make a copy of the input.
  if (repeat_times == 1) { return std::make_unique<column>(input.parent(), stream, mr); }

  // The functor is executed once per (row, repetition) pair, so the work-item count is
  // `strings_count * repeat_times`. That product is computed in `size_type`; reject inputs for
  // which it would overflow instead of launching with a garbage count. Dividing (rather than
  // multiplying) keeps the check itself free of overflow; `repeat_times > 1` here.
  CUDF_EXPECTS(strings_count <= std::numeric_limits<size_type>::max() / repeat_times,
               "The number of repeated strings exceeds the maximum allowed size.");

  auto const strings_dv_ptr = column_device_view::create(input.parent(), stream);
  auto const fn = compute_size_and_repeat_fn{*strings_dv_ptr, repeat_times, input.has_nulls()};

  // Repeat the strings in each row.
  // Note that this cannot handle the cases when the total size in bytes of the output column
  // exceeds the maximum value that can be indexed by size_type (offset_type).
  // In such situations, an exception may be thrown, or the output result is undefined.
  auto [offsets_column, chars_column] =
    make_strings_children(fn, strings_count * repeat_times, strings_count, stream, mr);

  return make_strings_column(strings_count,
                             std::move(offsets_column),
                             std::move(chars_column),
                             input.null_count(),
                             cudf::detail::copy_bitmask(input.parent(), stream, mr),
                             stream,
                             mr);
}

} // namespace detail

// Public entry point for the scalar overload: forwards to the detail implementation using
// `rmm::cuda_stream_default` as the stream.
std::unique_ptr<string_scalar> repeat_strings(string_scalar const& input,
                                              size_type repeat_times,
                                              rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto const stream = rmm::cuda_stream_default;
  return detail::repeat_strings(input, repeat_times, stream, mr);
}

// Public entry point for the column overload: forwards to the detail implementation using
// `rmm::cuda_stream_default` as the stream.
std::unique_ptr<column> repeat_strings(strings_column_view const& input,
                                       size_type repeat_times,
                                       rmm::mr::device_memory_resource* mr)
{
  CUDF_FUNC_RANGE();
  auto const stream = rmm::cuda_stream_default;
  return detail::repeat_strings(input, repeat_times, stream, mr);
}

} // namespace strings
} // namespace cudf
1 change: 1 addition & 0 deletions cpp/tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -351,6 +351,7 @@ ConfigureTest(STRINGS_TEST
strings/ipv4_tests.cpp
strings/json_tests.cpp
strings/pad_tests.cpp
strings/repeat_strings_tests.cpp
strings/replace_regex_tests.cpp
strings/replace_tests.cpp
strings/split_tests.cpp
Expand Down
Loading

0 comments on commit 8df3c6a

Please sign in to comment.