From 8df3c6a960d0c51d868664c75e848d71dac3f0b3 Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 9 Jun 2021 08:38:28 -0600 Subject: [PATCH] Implement `strings::repeat_strings` (#8423) This PR implements `strings::repeat_strings` which repeats the given string(s) multiple times. In contrast with the existing API `cudf::repeat` that repeats the rows (copies one row into multiple rows), this new API repeats the string within each row of the given strings column (copies the content of each string multiple times into the output string). For example: ``` strs = ['aa', null, '', 'bbc'] out = repeat_strings(strs, 3) out is ['aaaaaa', null, '', 'bbcbbcbbc'] ``` This implements cudf side API for https://github.com/NVIDIA/spark-rapids/issues/68. Authors: - Nghia Truong (https://github.com/ttnghia) Approvers: - AJ Schmidt (https://github.com/ajschmidt8) - David Wendt (https://github.com/davidwendt) - Karthikeyan (https://github.com/karthikeyann) - Robert Maynard (https://github.com/robertmaynard) URL: https://github.com/rapidsai/cudf/pull/8423 --- conda/recipes/libcudf/meta.yaml | 1 + cpp/CMakeLists.txt | 1 + cpp/include/cudf/strings/repeat_strings.hpp | 90 +++++++ cpp/include/doxygen_groups.h | 1 + cpp/src/strings/repeat_strings.cu | 194 +++++++++++++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/strings/repeat_strings_tests.cpp | 251 ++++++++++++++++++++ 7 files changed, 539 insertions(+) create mode 100644 cpp/include/cudf/strings/repeat_strings.hpp create mode 100644 cpp/src/strings/repeat_strings.cu create mode 100644 cpp/tests/strings/repeat_strings_tests.cpp diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index 139ceb1d6af..77d400b5d44 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -195,6 +195,7 @@ test: - test -f $PREFIX/include/cudf/strings/find_multiple.hpp - test -f $PREFIX/include/cudf/strings/json.hpp - test -f $PREFIX/include/cudf/strings/padding.hpp + - test -f 
$PREFIX/include/cudf/strings/repeat_strings.hpp - test -f $PREFIX/include/cudf/strings/replace.hpp - test -f $PREFIX/include/cudf/strings/replace_re.hpp - test -f $PREFIX/include/cudf/strings/split/partition.hpp diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index abfaeba86c3..090f613a9d1 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -360,6 +360,7 @@ add_library(cudf src/strings/json/json_path.cu src/strings/regex/regcomp.cpp src/strings/regex/regexec.cu + src/strings/repeat_strings.cu src/strings/replace/backref_re.cu src/strings/replace/backref_re_large.cu src/strings/replace/backref_re_medium.cu diff --git a/cpp/include/cudf/strings/repeat_strings.hpp b/cpp/include/cudf/strings/repeat_strings.hpp new file mode 100644 index 00000000000..4023dbc6c84 --- /dev/null +++ b/cpp/include/cudf/strings/repeat_strings.hpp @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +namespace cudf { +namespace strings { +/** + * @addtogroup strings_copy + * @{ + * @file + * @brief Strings APIs for copying strings. + */ + +/** + * @brief Repeat the given string scalar by a given number of times. + * + * For a given string scalar, an output string scalar is generated by repeating the input string by + * a number of times given by the @p `repeat_times` parameter. If `repeat_times` is not a positive + * value, an empty (valid) string scalar will be returned. 
An invalid input scalar will always + * result in an invalid output scalar regardless of the value of `repeat_times` parameter. + * + * @code{.pseudo} + * Example: + * s = '123XYZ-' + * out = repeat_strings(s, 3) + * out is '123XYZ-123XYZ-123XYZ-' + * @endcode + * + * @throw cudf::logic_error if the size of the output string scalar exceeds the maximum value that + * can be stored by the index type + * (i.e., `input.size() * repeat_times > numeric_limits<size_type>::max()`). + * + * @param input The scalar containing the string to repeat. + * @param repeat_times The number of times the `input` string is copied to the output. + * @param mr Device memory resource used to allocate the returned string scalar. + * @return New string scalar in which the string is repeated from the input. + */ +std::unique_ptr repeat_strings( + string_scalar const& input, + size_type repeat_times, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Repeat each string in the given strings column by a given number of times. + * + * For a given strings column, an output strings column is generated by repeating each string from + * the input by a number of times given by the @p `repeat_times` parameter. If `repeat_times` is not + * a positive value, all the rows of the output strings column will be an empty string. Any null row + * will result in a null row regardless of the value of `repeat_times` parameter. + * + * Note that this function cannot handle the cases when the size of the output column exceeds the + * maximum value that can be indexed by size_type (offset_type). In such situations, an exception + * may be thrown, or the output result is undefined. + * + * @code{.pseudo} + * Example: + * strs = ['aa', null, '', 'bbc'] + * out = repeat_strings(strs, 3) + * out is ['aaaaaa', null, '', 'bbcbbcbbc'] + * @endcode + * + * @param input The column containing strings to repeat. 
+ * @param repeat_times The number of times each input string is copied to the output. + * @param mr Device memory resource used to allocate the returned strings column. + * @return New column with concatenated results. + */ +std::unique_ptr repeat_strings( + strings_column_view const& input, + size_type repeat_times, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index dda8ce87432..5dbf5377396 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -122,6 +122,7 @@ * @defgroup strings_combine Combining * @defgroup strings_contains Searching * @defgroup strings_convert Converting + * @defgroup strings_copy Copying * @defgroup strings_substring Substring * @defgroup strings_find Finding * @defgroup strings_modify Modifying diff --git a/cpp/src/strings/repeat_strings.cu b/cpp/src/strings/repeat_strings.cu new file mode 100644 index 00000000000..79ea94e8a06 --- /dev/null +++ b/cpp/src/strings/repeat_strings.cu @@ -0,0 +1,194 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#include + +namespace cudf { +namespace strings { +namespace detail { + +std::unique_ptr repeat_strings(string_scalar const& input, + size_type repeat_times, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + if (!input.is_valid(stream)) { return std::make_unique("", false, stream, mr); } + if (input.size() == 0 || repeat_times <= 0) { + return std::make_unique("", true, stream, mr); + } + if (repeat_times == 1) { return std::make_unique(input, stream, mr); } + + CUDF_EXPECTS(input.size() <= std::numeric_limits::max() / repeat_times, + "The output string has size that exceeds the maximum allowed size."); + + auto const str_size = input.size(); + auto const iter = thrust::make_counting_iterator(0); + auto buff = rmm::device_buffer(repeat_times * input.size(), stream, mr); + + // Pull data from the input string into each byte of the output string. + thrust::transform(rmm::exec_policy(stream), + iter, + iter + repeat_times * str_size, + static_cast(buff.data()), + [in_ptr = input.data(), str_size] __device__(const auto idx) { + return in_ptr[idx % str_size]; + }); + + return std::make_unique(std::move(buff)); +} + +namespace { +/** + * @brief Generate a strings column in which each row is an empty or null string. + * + * The output strings column has the same bitmask as the input column. 
+ */ +auto generate_empty_output(strings_column_view const& input, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto chars_column = create_chars_child_column(strings_count, 0, stream, mr); + + auto offsets_column = make_numeric_column( + data_type{type_to_id()}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); + CUDA_TRY(cudaMemsetAsync(offsets_column->mutable_view().template data(), + 0, + offsets_column->size() * sizeof(offset_type), + stream.value())); + + return make_strings_column(strings_count, + std::move(offsets_column), + std::move(chars_column), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} + +/** + * @brief Functor to compute string sizes and repeat the input strings. + * + * This functor is called only when `repeat_times > 0`. In addition, the total number of threads + * running this functor is `repeat_times * strings_count` (instead of `string_count`) for maximizing + * parallelism and better load-balancing. + */ +struct compute_size_and_repeat_fn { + column_device_view const strings_dv; + size_type const repeat_times; + bool const has_nulls; + + offset_type* d_offsets{nullptr}; + + // If d_chars == nullptr: only compute sizes of the output strings. + // If d_chars != nullptr: only repeat strings. + char* d_chars{nullptr}; + + // `idx` will be in the range of [0, repeat_times * strings_count). + __device__ void operator()(size_type const idx) const noexcept + { + auto const str_idx = idx / repeat_times; // value cycles in [0, string_count) + auto const repeat_idx = idx % repeat_times; // value cycles in [0, repeat_times) + auto const is_valid = !has_nulls || strings_dv.is_valid_nocheck(str_idx); + + if (!d_chars && repeat_idx == 0) { + d_offsets[str_idx] = + is_valid ? repeat_times * strings_dv.element(str_idx).size_bytes() : 0; + } + + // Each input string will be copied by `repeat_times` threads into the output string. 
+ if (d_chars && is_valid) { + auto const d_str = strings_dv.element(str_idx); + auto const str_size = d_str.size_bytes(); + if (str_size > 0) { + auto const input_ptr = d_str.data(); + auto const output_ptr = d_chars + d_offsets[str_idx] + repeat_idx * str_size; + std::memcpy(output_ptr, input_ptr, str_size); + } + } + } +}; + +} // namespace + +std::unique_ptr repeat_strings(strings_column_view const& input, + size_type repeat_times, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + auto const strings_count = input.size(); + if (strings_count == 0) { return make_empty_column(data_type{type_id::STRING}); } + + if (repeat_times <= 0) { + // If the number of repetitions is not positive, each row of the output strings column will be + // either an empty string (if the input row is not null), or a null (if the input row is null). + return generate_empty_output(input, strings_count, stream, mr); + } + + // If `repeat_times == 1`, just make a copy of the input. + if (repeat_times == 1) { return std::make_unique(input.parent(), stream, mr); } + + auto const strings_dv_ptr = column_device_view::create(input.parent(), stream); + auto const fn = compute_size_and_repeat_fn{*strings_dv_ptr, repeat_times, input.has_nulls()}; + + // Repeat the strings in each row. + // Note that this cannot handle the cases when the size of the output column exceeds the maximum + // value that can be indexed by size_type (offset_type). + // In such situations, an exception may be thrown, or the output result is undefined. 
+ auto [offsets_column, chars_column] = + make_strings_children(fn, strings_count * repeat_times, strings_count, stream, mr); + + return make_strings_column(strings_count, + std::move(offsets_column), + std::move(chars_column), + input.null_count(), + cudf::detail::copy_bitmask(input.parent(), stream, mr), + stream, + mr); +} + +} // namespace detail + +std::unique_ptr repeat_strings(string_scalar const& input, + size_type repeat_times, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::repeat_strings(input, repeat_times, rmm::cuda_stream_default, mr); +} + +std::unique_ptr repeat_strings(strings_column_view const& input, + size_type repeat_times, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + return detail::repeat_strings(input, repeat_times, rmm::cuda_stream_default, mr); +} + +} // namespace strings +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 6a8e36e6cf6..fdaeb3ebdab 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -351,6 +351,7 @@ ConfigureTest(STRINGS_TEST strings/ipv4_tests.cpp strings/json_tests.cpp strings/pad_tests.cpp + strings/repeat_strings_tests.cpp strings/replace_regex_tests.cpp strings/replace_tests.cpp strings/split_tests.cpp diff --git a/cpp/tests/strings/repeat_strings_tests.cpp b/cpp/tests/strings/repeat_strings_tests.cpp new file mode 100644 index 00000000000..c3e67e86f1b --- /dev/null +++ b/cpp/tests/strings/repeat_strings_tests.cpp @@ -0,0 +1,251 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include + +#include +#include +#include + +namespace { +using STR_COL = cudf::test::strings_column_wrapper; + +constexpr bool print_all{false}; + +auto all_nulls() { return cudf::test::iterator_all_nulls(); } + +auto null_at(cudf::size_type idx) { return cudf::test::iterator_with_null_at(idx); } + +auto null_at(std::vector const& indices) +{ + return cudf::test::iterator_with_null_at(cudf::host_span{indices}); +} + +} // namespace + +struct RepeatJoinStringTest : public cudf::test::BaseFixture { +}; + +TEST_F(RepeatJoinStringTest, InvalidStringScalar) +{ + auto const str = cudf::string_scalar("", false); + auto const result = cudf::strings::repeat_strings(str, 3); + EXPECT_EQ(result->is_valid(), false); +} + +TEST_F(RepeatJoinStringTest, ZeroSizeStringScalar) +{ + auto const str = cudf::string_scalar(""); + auto const result = cudf::strings::repeat_strings(str, 3); + EXPECT_EQ(result->is_valid(), true); + EXPECT_EQ(result->size(), 0); +} + +TEST_F(RepeatJoinStringTest, ValidStringScalar) +{ + auto const str = cudf::string_scalar("abc123xyz-"); + + { + auto const result = cudf::strings::repeat_strings(str, 3); + auto const expected = cudf::string_scalar("abc123xyz-abc123xyz-abc123xyz-"); + CUDF_TEST_EXPECT_EQUAL_BUFFERS(expected.data(), result->data(), expected.size()); + } + + // Repeat once. + { + auto const result = cudf::strings::repeat_strings(str, 1); + CUDF_TEST_EXPECT_EQUAL_BUFFERS(str.data(), result->data(), str.size()); + } + + // Zero repeat times. 
+ { + auto const result = cudf::strings::repeat_strings(str, 0); + EXPECT_EQ(result->is_valid(), true); + EXPECT_EQ(result->size(), 0); + } + + // Negative repeat times. + { + auto const result = cudf::strings::repeat_strings(str, -10); + EXPECT_EQ(result->is_valid(), true); + EXPECT_EQ(result->size(), 0); + } + + // Repeat too many times. + { + EXPECT_THROW(cudf::strings::repeat_strings(str, std::numeric_limits::max() / 2), + cudf::logic_error); + } +} + +TEST_F(RepeatJoinStringTest, ZeroSizeStringsColumn) +{ + auto const strs = STR_COL{}; + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 10); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, print_all); +} + +TEST_F(RepeatJoinStringTest, AllEmptyStringsColumn) +{ + auto const strs = STR_COL{"", "", "", "", ""}; + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 10); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, print_all); +} + +TEST_F(RepeatJoinStringTest, AllNullStringsColumn) +{ + auto const strs = STR_COL{{"" /*NULL*/, "" /*NULL*/, "" /*NULL*/}, all_nulls()}; + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 10); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, print_all); +} + +TEST_F(RepeatJoinStringTest, ZeroSizeAndNullStringsColumn) +{ + auto const strs = + STR_COL{{"" /*NULL*/, "", "" /*NULL*/, "", "", "" /*NULL*/}, null_at({0, 2, 5})}; + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 10); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, print_all); +} + +TEST_F(RepeatJoinStringTest, StringsColumnNoNull) +{ + auto const strs = STR_COL{"0a0b0c", "abcxyz", "xyzééé", "ááá", "íí"}; + + { + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 2); + auto const expected = STR_COL{"0a0b0c0a0b0c", "abcxyzabcxyz", "xyzéééxyzééé", "áááááá", "íííí"}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Repeat once. 
+ { + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, print_all); + } + + // Non-positive repeat times. + { + auto const expected = STR_COL{"", "", "", "", ""}; + + auto results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 0); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + + results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), -100); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Sliced the first half of the column. + { + auto const sliced_strs = cudf::slice(strs, {0, 3})[0]; + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); + auto const expected = STR_COL{"0a0b0c0a0b0c", "abcxyzabcxyz", "xyzéééxyzééé"}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Sliced the middle of the column. + { + auto const sliced_strs = cudf::slice(strs, {1, 3})[0]; + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); + auto const expected = STR_COL{"abcxyzabcxyz", "xyzéééxyzééé"}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Sliced the second half of the column. 
+ { + auto const sliced_strs = cudf::slice(strs, {2, 5})[0]; + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); + auto const expected = STR_COL{"xyzéééxyzééé", "áááááá", "íííí"}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } +} + +TEST_F(RepeatJoinStringTest, StringsColumnWithNulls) +{ + auto const strs = STR_COL{{"0a0b0c", + "" /*NULL*/, + "abcxyz", + "" /*NULL*/, + "xyzééé", + "" /*NULL*/, + "ááá", + "íí", + "", + "Hello World"}, + null_at({1, 3, 5})}; + + { + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 2); + auto const expected = STR_COL{{"0a0b0c0a0b0c", + "" /*NULL*/, + "abcxyzabcxyz", + "" /*NULL*/, + "xyzéééxyzééé", + "" /*NULL*/, + "áááááá", + "íííí", + "", + "Hello WorldHello World"}, + null_at({1, 3, 5})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Repeat once. + { + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(strs, *results, print_all); + } + + // Non-positive repeat times. + { + auto const expected = STR_COL{ + {"", "" /*NULL*/, "", "" /*NULL*/, "", "" /*NULL*/, "", "", "", ""}, null_at({1, 3, 5})}; + + auto results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), 0); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + + results = cudf::strings::repeat_strings(cudf::strings_column_view(strs), -100); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Sliced the first half of the column. + { + auto const sliced_strs = cudf::slice(strs, {0, 3})[0]; + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); + auto const expected = STR_COL{{"0a0b0c0a0b0c", "" /*NULL*/, "abcxyzabcxyz"}, null_at(1)}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Sliced the middle of the column. 
+ { + auto const sliced_strs = cudf::slice(strs, {2, 7})[0]; + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); + auto const expected = STR_COL{ + {"abcxyzabcxyz", "" /*NULL*/, "xyzéééxyzééé", "" /*NULL*/, "áááááá"}, null_at({1, 3})}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected, *results, print_all); + } + + // Sliced the second half of the column. + { + auto const sliced_strs = cudf::slice(strs, {6, 10})[0]; + auto const results = cudf::strings::repeat_strings(cudf::strings_column_view(sliced_strs), 2); + auto const expected = STR_COL{"áááááá", "íííí", "", "Hello WorldHello World"}; + + // The results strings column may have a bitmask with all valid values. + CUDF_TEST_EXPECT_COLUMNS_EQUIVALENT(expected, *results, print_all); + } +}