From cb8b1ef2e0623ddf32ea01dc9c24d07956ec686f Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 3 Feb 2021 13:53:45 -0500 Subject: [PATCH 1/8] Add UTF-8 chars to create_random_column utility --- cpp/benchmarks/CMakeLists.txt | 6 ++-- .../common/generate_benchmark_input.cpp | 29 +++++++++++++------ 2 files changed, 23 insertions(+), 12 deletions(-) diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 073f0d62c0a..29e7dd57435 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -301,9 +301,9 @@ set(SUBWORD_TOKENIZER_BENCH_SRC ConfigureBench(SUBWORD_TOKENIZER_BENCH "${SUBWORD_TOKENIZER_BENCH_SRC}") ################################################################################################### -# - convert to string benchmark ------------------------------------------------------------------- +# - strings benchmark ------------------------------------------------------------------- -set(DURATION_TO_STRING_BENCH_SRC +set(STRINGS_BENCH_SRC "${CMAKE_CURRENT_SOURCE_DIR}/string/convert_durations_benchmark.cpp") -ConfigureBench(DURATION_TO_STRING_BENCH "${DURATION_TO_STRING_BENCH_SRC}") +ConfigureBench(STRINGS_BENCH "${STRINGS_BENCH_SRC}") diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp index 6200d9a9368..0d5c713afb8 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_benchmark_input.cpp @@ -347,14 +347,24 @@ void copy_string(cudf::size_type src_idx, template void append_string(Char_gen& char_gen, bool valid, uint32_t length, string_column_data& column_data) { - auto const idx = column_data.offsets.size() - 1; - column_data.offsets.push_back(column_data.offsets.back() + length); - std::generate_n(std::back_inserter(column_data.chars), - column_data.offsets[idx + 1] - column_data.offsets[idx], - [&]() { return char_gen(); }); - - // TODO: use empty string for invalid fields? 
- if (!valid) { cudf::clear_bit_unsafe(column_data.null_mask.data(), idx); } + if (!valid) { + auto const idx = column_data.offsets.size() - 1; + cudf::clear_bit_unsafe(column_data.null_mask.data(), idx); + // duplicate the offset value to indicate an empty row + column_data.offsets.push_back(column_data.offsets.back()); + return; + } + std::vector chars; + std::generate_n(std::back_inserter(chars), length, [&]() mutable { + auto ch = char_gen(); + if (ch < '\x7F') return static_cast(ch); + // x7F is at the top edge of ASCII; + // the next set of characters are assigned two bytes + chars.push_back('\xC4'); + return static_cast(ch + 1); + }); + column_data.chars.insert(column_data.chars.end(), chars.begin(), chars.end()); + column_data.offsets.push_back(column_data.offsets.back() + chars.size()); } /** @@ -371,7 +381,8 @@ std::unique_ptr create_random_column(data_profi std::mt19937& engine, cudf::size_type num_rows) { - auto char_dist = [&engine, dist = std::uniform_int_distribution{'!', '~'}]() mutable { + auto char_dist = [&engine, // range 32-126 is ASCII; 127-137 will be multi-byte UTF-8 + dist = std::uniform_int_distribution{32, 137}]() mutable { return dist(engine); }; auto len_dist = From c3859763cd4d7d1c212cf95d48f38f2e16031f5a Mon Sep 17 00:00:00 2001 From: davidwendt Date: Wed, 3 Feb 2021 16:05:26 -0500 Subject: [PATCH 2/8] update column_data.chars directly in append_string --- cpp/benchmarks/common/generate_benchmark_input.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp index 0d5c713afb8..97b1394bdc1 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_benchmark_input.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. 
* * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -354,17 +354,15 @@ void append_string(Char_gen& char_gen, bool valid, uint32_t length, string_colum column_data.offsets.push_back(column_data.offsets.back()); return; } - std::vector chars; - std::generate_n(std::back_inserter(chars), length, [&]() mutable { + std::generate_n(std::back_inserter(column_data.chars), length, [&]() mutable { auto ch = char_gen(); if (ch < '\x7F') return static_cast(ch); // x7F is at the top edge of ASCII; // the next set of characters are assigned two bytes - chars.push_back('\xC4'); + column_data.chars.push_back('\xC4'); return static_cast(ch + 1); }); - column_data.chars.insert(column_data.chars.end(), chars.begin(), chars.end()); - column_data.offsets.push_back(column_data.offsets.back() + chars.size()); + column_data.offsets.push_back(column_data.chars.size()); } /** From 0e38854704da889f976826b48acb2c8b957cf8c6 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Thu, 4 Feb 2021 14:10:09 -0500 Subject: [PATCH 3/8] added gbenchmark for strings to_lower --- cpp/benchmarks/CMakeLists.txt | 1 + cpp/benchmarks/string/case_benchmark.cpp | 51 ++++++++++++++++++++++++ 2 files changed, 52 insertions(+) create mode 100644 cpp/benchmarks/string/case_benchmark.cpp diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index 29e7dd57435..a5c213a419b 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -304,6 +304,7 @@ ConfigureBench(SUBWORD_TOKENIZER_BENCH "${SUBWORD_TOKENIZER_BENCH_SRC}") # - strings benchmark ------------------------------------------------------------------- set(STRINGS_BENCH_SRC + "${CMAKE_CURRENT_SOURCE_DIR}/string/case_benchmark.cpp" "${CMAKE_CURRENT_SOURCE_DIR}/string/convert_durations_benchmark.cpp") ConfigureBench(STRINGS_BENCH "${STRINGS_BENCH_SRC}") diff --git a/cpp/benchmarks/string/case_benchmark.cpp 
b/cpp/benchmarks/string/case_benchmark.cpp new file mode 100644 index 00000000000..5c6d89755d4 --- /dev/null +++ b/cpp/benchmarks/string/case_benchmark.cpp @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2021, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include + +#include +#include + +class StringCase : public cudf::benchmark { +}; + +static void BM_case(benchmark::State& state) +{ + cudf::size_type const n_rows{(cudf::size_type)state.range(0)}; + auto const table = create_random_table({cudf::type_id::STRING}, 1, row_count{n_rows}); + cudf::strings_column_view input(table->view().column(0)); + + for (auto _ : state) { + cuda_event_timer raii(state, true, 0); + cudf::strings::to_lower(input); + } + + state.SetBytesProcessed(state.iterations() * input.chars_size()); +} + +#define SORT_BENCHMARK_DEFINE(name) \ + BENCHMARK_DEFINE_F(StringCase, name) \ + (::benchmark::State & st) { BM_case(st); } \ + BENCHMARK_REGISTER_F(StringCase, name) \ + ->RangeMultiplier(8) \ + ->Ranges({{1 << 12, 1 << 24}}) \ + ->UseManualTime() \ + ->Unit(benchmark::kMillisecond); + +SORT_BENCHMARK_DEFINE(strings) From 1238890ee8f3f52762d802d373c8eb7d7e49f198 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Thu, 4 Feb 2021 14:11:22 -0500 Subject: [PATCH 4/8] refactored upper/lower strings code to use make_strings_children utility --- cpp/src/strings/case.cu | 124 +++++++++-------------- cpp/src/strings/char_types/char_cases.cu | 6 +- 
cpp/src/strings/char_types/char_cases.h | 6 +- 3 files changed, 52 insertions(+), 84 deletions(-) diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index ed689edc80e..7d46302d9ce 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -19,6 +19,7 @@ #include #include #include +#include #include #include #include @@ -37,30 +38,18 @@ namespace cudf { namespace strings { namespace detail { namespace { -/** - * @brief Used as template parameter to divide size calculation from - * the actual string operation within a function. - * - * Useful when most of the logic is identical for both passes. - */ -enum TwoPass { - SizeOnly = 0, ///< calculate the size only - ExecuteOp ///< run the string operation -}; /** * @brief Per string logic for case conversion functions. * - * @tparam Pass Determines if size calculation or output write is begin performed. */ -template struct upper_lower_fn { const column_device_view d_column; character_flags_table_type case_flag; // flag to check with on each character const character_flags_table_type* d_flags; const character_cases_table_type* d_case_table; const special_case_mapping* d_special_case_mapping; - const int32_t* d_offsets{}; + int32_t* d_offsets{}; char* d_chars{}; __device__ special_case_mapping get_special_case_mapping(uint32_t code_point) @@ -70,7 +59,7 @@ struct upper_lower_fn { // compute-size / copy the bytes representing the special case mapping for this codepoint __device__ int32_t handle_special_case_bytes(uint32_t code_point, - char*& d_buffer, + char* d_buffer, detail::character_flags_table_type flag) { special_case_mapping m = get_special_case_mapping(code_point); @@ -79,25 +68,28 @@ struct upper_lower_fn { auto const count = IS_LOWER(flag) ? 
m.num_upper_chars : m.num_lower_chars; auto const* chars = IS_LOWER(flag) ? m.upper : m.lower; for (uint16_t idx = 0; idx < count; idx++) { - if (Pass == SizeOnly) { + if (!d_buffer) { bytes += detail::bytes_in_char_utf8(detail::codepoint_to_utf8(chars[idx])); } else { bytes += detail::from_char_utf8(detail::codepoint_to_utf8(chars[idx]), d_buffer + bytes); } } - if (d_buffer != nullptr) { d_buffer += bytes; } + // if (d_buffer != nullptr) { d_buffer += bytes; } return bytes; } - __device__ int32_t operator()(size_type idx) + __device__ void operator()(size_type idx) { - if (d_column.is_null(idx)) return 0; // null string - string_view d_str = d_column.template element(idx); - int32_t bytes = 0; - char* d_buffer = nullptr; - if (Pass == ExecuteOp) d_buffer = d_chars + d_offsets[idx]; + if (d_column.is_null(idx)) { + if (!d_chars) d_offsets[idx] = 0; + return; + } + auto const d_str = d_column.template element(idx); + int32_t bytes = 0; + char* d_buffer = d_chars ? d_chars + d_offsets[idx] : nullptr; for (auto itr = d_str.begin(); itr != d_str.end(); ++itr) { - uint32_t code_point = detail::utf8_to_codepoint(*itr); + uint32_t code_point = detail::utf8_to_codepoint(*itr); + detail::character_flags_table_type flag = code_point <= 0x00FFFF ? 
d_flags[code_point] : 0; // we apply special mapping in two cases: @@ -105,21 +97,19 @@ struct upper_lower_fn { // - cased characters with the special mapping flag, when matching the input case_flag // if (IS_SPECIAL(flag) && ((flag & case_flag) || !IS_UPPER_OR_LOWER(flag))) { - bytes += handle_special_case_bytes(code_point, d_buffer, case_flag); - } else if (flag & case_flag) { - if (Pass == SizeOnly) - bytes += detail::bytes_in_char_utf8(detail::codepoint_to_utf8(d_case_table[code_point])); - else - d_buffer += - detail::from_char_utf8(detail::codepoint_to_utf8(d_case_table[code_point]), d_buffer); + auto const new_bytes = handle_special_case_bytes(code_point, d_buffer, case_flag); + bytes += new_bytes; + if (d_buffer) d_buffer += new_bytes; } else { - if (Pass == SizeOnly) - bytes += detail::bytes_in_char_utf8(*itr); + char_utf8 new_char = + (flag & case_flag) ? detail::codepoint_to_utf8(d_case_table[code_point]) : *itr; + if (!d_buffer) + bytes += detail::bytes_in_char_utf8(new_char); else - d_buffer += detail::from_char_utf8(*itr, d_buffer); + d_buffer += detail::from_char_utf8(new_char, d_buffer); } } - return bytes; + if (!d_buffer) d_offsets[idx] = bytes; } }; @@ -138,49 +128,27 @@ std::unique_ptr convert_case(strings_column_view const& strings, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto strings_count = strings.size(); - if (strings_count == 0) return detail::make_empty_strings_column(stream, mr); - - auto strings_column = column_device_view::create(strings.parent(), stream); - auto d_column = *strings_column; - size_type null_count = strings.null_count(); - - // copy null mask - rmm::device_buffer null_mask = cudf::detail::copy_bitmask(strings.parent(), stream, mr); - // get the lookup tables used for case conversion - auto d_flags = get_character_flags_table(); - - auto d_case_table = get_character_cases_table(); - auto d_special_case_mapping = get_special_case_mapping_table(); - - // build offsets column -- calculate the 
size of each output string - auto offsets_transformer_itr = thrust::make_transform_iterator( - thrust::make_counting_iterator(0), - upper_lower_fn{d_column, case_flag, d_flags, d_case_table, d_special_case_mapping}); - auto offsets_column = detail::make_offsets_child_column( - offsets_transformer_itr, offsets_transformer_itr + strings_count, stream, mr); - auto offsets_view = offsets_column->view(); - auto d_new_offsets = offsets_view.data(); - - // build the chars column -- convert characters based on case_flag parameter - size_type bytes = thrust::device_pointer_cast(d_new_offsets)[strings_count]; - auto chars_column = - strings::detail::create_chars_child_column(strings_count, null_count, bytes, stream, mr); - auto chars_view = chars_column->mutable_view(); - auto d_chars = chars_view.data(); - - thrust::for_each_n( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - strings_count, - upper_lower_fn{ - d_column, case_flag, d_flags, d_case_table, d_special_case_mapping, d_new_offsets, d_chars}); - - return make_strings_column(strings_count, - std::move(offsets_column), - std::move(chars_column), - null_count, - std::move(null_mask), + if (strings.is_empty()) return detail::make_empty_strings_column(stream, mr); + + auto strings_column = column_device_view::create(strings.parent(), stream); + auto d_column = *strings_column; + + // build functor with lookup tables used for case conversion + upper_lower_fn functor{d_column, + case_flag, + get_character_flags_table(), + get_character_cases_table(), + get_special_case_mapping_table()}; + + // this utility calls the functor to build the offsets and chars columns + auto children = cudf::strings::detail::make_strings_children( + functor, strings.size(), strings.null_count(), stream, mr); + + return make_strings_column(strings.size(), + std::move(children.first), + std::move(children.second), + strings.null_count(), + cudf::detail::copy_bitmask(strings.parent(), stream, mr), stream, mr); } diff --git 
a/cpp/src/strings/char_types/char_cases.cu b/cpp/src/strings/char_types/char_cases.cu index b104c00eeab..1021d5768c1 100644 --- a/cpp/src/strings/char_types/char_cases.cu +++ b/cpp/src/strings/char_types/char_cases.cu @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020, NVIDIA CORPORATION. + * Copyright (c) 2020-2021, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -188,9 +188,9 @@ void generate_special_mapping_hash_table() printf( "// the special case mapping table is a perfect hash table with no collisions, allowing us\n" "// to 'hash' by simply modding by the incoming codepoint\n" - "inline __device__ uint16_t get_special_case_hash_index(uint32_t code_point){\n" + "constexpr uint16_t get_special_case_hash_index(uint32_t code_point){\n" " constexpr uint16_t special_case_prime = %d;\n" - " return code_point %% special_case_prime;" + " return static_cast(code_point %% special_case_prime);" "\n}\n", hash_prime); } diff --git a/cpp/src/strings/char_types/char_cases.h b/cpp/src/strings/char_types/char_cases.h index 7fd0e586db9..88d7861a72b 100644 --- a/cpp/src/strings/char_types/char_cases.h +++ b/cpp/src/strings/char_types/char_cases.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. All rights reserved. + * Copyright (c) 2019-2021, NVIDIA CORPORATION. All rights reserved. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. 
@@ -5236,10 +5236,10 @@ constexpr special_case_mapping g_special_case_mappings[] = { }; // the special case mapping table is a perfect hash table with no collisions, allowing us // to 'hash' by simply modding by the incoming codepoint -inline __device__ uint16_t get_special_case_hash_index(uint32_t code_point) +constexpr uint16_t get_special_case_hash_index(uint32_t code_point) { constexpr uint16_t special_case_prime = 499; - return code_point % special_case_prime; + return static_cast(code_point % special_case_prime); } } // namespace detail From 3a0b9d58a41aa25121aca99051d835cc713aa490 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Thu, 4 Feb 2021 18:33:18 -0500 Subject: [PATCH 5/8] convert if-stmt to ternary --- cpp/src/strings/case.cu | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/cpp/src/strings/case.cu b/cpp/src/strings/case.cu index 7d46302d9ce..9f3a1caba8a 100644 --- a/cpp/src/strings/case.cu +++ b/cpp/src/strings/case.cu @@ -68,13 +68,10 @@ struct upper_lower_fn { auto const count = IS_LOWER(flag) ? m.num_upper_chars : m.num_lower_chars; auto const* chars = IS_LOWER(flag) ? m.upper : m.lower; for (uint16_t idx = 0; idx < count; idx++) { - if (!d_buffer) { - bytes += detail::bytes_in_char_utf8(detail::codepoint_to_utf8(chars[idx])); - } else { - bytes += detail::from_char_utf8(detail::codepoint_to_utf8(chars[idx]), d_buffer + bytes); - } + bytes += d_buffer + ? 
detail::from_char_utf8(detail::codepoint_to_utf8(chars[idx]), d_buffer + bytes) + : detail::bytes_in_char_utf8(detail::codepoint_to_utf8(chars[idx])); } - // if (d_buffer != nullptr) { d_buffer += bytes; } return bytes; } From 7df4c9571a285c565d90d58d4f67c2d8501ae7fc Mon Sep 17 00:00:00 2001 From: davidwendt Date: Thu, 4 Feb 2021 20:12:30 -0500 Subject: [PATCH 6/8] add const to variable declaration --- cpp/benchmarks/common/generate_benchmark_input.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp index 97b1394bdc1..6d8bf0bec8a 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_benchmark_input.cpp @@ -355,7 +355,7 @@ void append_string(Char_gen& char_gen, bool valid, uint32_t length, string_colum return; } std::generate_n(std::back_inserter(column_data.chars), length, [&]() mutable { - auto ch = char_gen(); + auto const ch = char_gen(); if (ch < '\x7F') return static_cast(ch); // x7F is at the top edge of ASCII; // the next set of characters are assigned two bytes From 301b9067812c57a5450d97fd29c22e81df6b1fd2 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Fri, 5 Feb 2021 11:32:49 -0500 Subject: [PATCH 7/8] change generate_n to for-loop --- cpp/benchmarks/common/generate_benchmark_input.cpp | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/cpp/benchmarks/common/generate_benchmark_input.cpp b/cpp/benchmarks/common/generate_benchmark_input.cpp index 6d8bf0bec8a..f20d2cab725 100644 --- a/cpp/benchmarks/common/generate_benchmark_input.cpp +++ b/cpp/benchmarks/common/generate_benchmark_input.cpp @@ -354,14 +354,12 @@ void append_string(Char_gen& char_gen, bool valid, uint32_t length, string_colum column_data.offsets.push_back(column_data.offsets.back()); return; } - std::generate_n(std::back_inserter(column_data.chars), length, [&]() mutable { + for (uint32_t idx = 0; 
idx < length; ++idx) { auto const ch = char_gen(); - if (ch < '\x7F') return static_cast(ch); - // x7F is at the top edge of ASCII; - // the next set of characters are assigned two bytes - column_data.chars.push_back('\xC4'); - return static_cast(ch + 1); - }); + if (ch >= '\x7F') // x7F is at the top edge of ASCII + column_data.chars.push_back('\xC4'); // these characters are assigned two bytes + column_data.chars.push_back(static_cast(ch + (ch >= '\x7F'))); + } column_data.offsets.push_back(column_data.chars.size()); } From 10bf5f3c047d751b51b82ecf6261dd79613b60c5 Mon Sep 17 00:00:00 2001 From: davidwendt Date: Fri, 5 Feb 2021 12:24:36 -0500 Subject: [PATCH 8/8] change gbenchmark output name to to_lower --- cpp/benchmarks/string/case_benchmark.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cpp/benchmarks/string/case_benchmark.cpp b/cpp/benchmarks/string/case_benchmark.cpp index 5c6d89755d4..9c1c81da22a 100644 --- a/cpp/benchmarks/string/case_benchmark.cpp +++ b/cpp/benchmarks/string/case_benchmark.cpp @@ -48,4 +48,4 @@ static void BM_case(benchmark::State& state) ->UseManualTime() \ ->Unit(benchmark::kMillisecond); -SORT_BENCHMARK_DEFINE(strings) +SORT_BENCHMARK_DEFINE(to_lower)