From ab5f52d1cfd640cdaeaf5369510ff7974f729d5d Mon Sep 17 00:00:00 2001 From: Nghia Truong Date: Wed, 11 Jan 2023 13:16:43 -0800 Subject: [PATCH] Implement a simplified version of `from_json` (#844) * Add `MapUtils.java` * Add clang-format style * Fix comment * Add empty files * Fix compile issue and update clang-format * Add Java test * Concatenate the input json strings * Misc * Misc * Print debug * Update Java test * Add more test * Implement several more computation * Add comments * Implement node-to-token-index map * Compute node range from node indices, not token indices * Extract node ranges for key-value of non-nested types * Add more pairs to node section * Get node ranges for nested nodes * Extract json key-value pairs * Extract parent node ids of keys * Compute offsets for the output lists * Fix offsets computation Signed-off-by: Nghia Truong * Print debug for the output Signed-off-by: Nghia Truong * More efficient substring operation Signed-off-by: Nghia Truong * Update Java test Signed-off-by: Nghia Truong * Remove parameter Signed-off-by: Nghia Truong * Rewrite docs Signed-off-by: Nghia Truong * Rewrite for easier benchmark Signed-off-by: Nghia Truong * Extract out functions Signed-off-by: Nghia Truong * Refactor and cleanup Signed-off-by: Nghia Truong * Handle empty and nulls input rows Signed-off-by: Nghia Truong * Update Java test Signed-off-by: Nghia Truong * Cleanup headers Signed-off-by: Nghia Truong * Implement UTF-8 support Signed-off-by: Nghia Truong * Add Java test Signed-off-by: Nghia Truong * Fix error Signed-off-by: Nghia Truong * Move header into .cu file Signed-off-by: Nghia Truong * Update copyright headers Signed-off-by: Nghia Truong * Update function name Signed-off-by: Nghia Truong * Add `assert` Signed-off-by: Nghia Truong * Remove wrong comment Signed-off-by: Nghia Truong * Extract debug code into a separate header Signed-off-by: Nghia Truong * Simplify `output_size` computation Signed-off-by: Nghia Truong * Fix typo Signed-off-by: 
Nghia Truong * Cleanup unused variable Signed-off-by: Nghia Truong * Fix a bug Signed-off-by: Nghia Truong * Rename variable Signed-off-by: Nghia Truong * Print debug input when error Signed-off-by: Nghia Truong * Change the error message Signed-off-by: Nghia Truong * Optimize error report Signed-off-by: Nghia Truong * Change comment Signed-off-by: Nghia Truong Signed-off-by: Nghia Truong --- src/main/cpp/.clang-format | 204 ++++++ src/main/cpp/CMakeLists.txt | 10 +- src/main/cpp/src/MapUtilsJni.cpp | 35 + src/main/cpp/src/map_utils.cu | 635 ++++++++++++++++++ src/main/cpp/src/map_utils.hpp | 31 + src/main/cpp/src/map_utils_debug.cuh | 166 +++++ .../com/nvidia/spark/rapids/jni/MapUtils.java | 55 ++ .../nvidia/spark/rapids/jni/MapUtilsTest.java | 85 +++ 8 files changed, 1217 insertions(+), 4 deletions(-) create mode 100644 src/main/cpp/.clang-format create mode 100644 src/main/cpp/src/MapUtilsJni.cpp create mode 100644 src/main/cpp/src/map_utils.cu create mode 100644 src/main/cpp/src/map_utils.hpp create mode 100644 src/main/cpp/src/map_utils_debug.cuh create mode 100644 src/main/java/com/nvidia/spark/rapids/jni/MapUtils.java create mode 100644 src/test/java/com/nvidia/spark/rapids/jni/MapUtilsTest.java diff --git a/src/main/cpp/.clang-format b/src/main/cpp/.clang-format new file mode 100644 index 00000000000..e0866533a36 --- /dev/null +++ b/src/main/cpp/.clang-format @@ -0,0 +1,204 @@ +--- +# Reference: https://clang.llvm.org/docs/ClangFormatStyleOptions.html +Language: Cpp +# BasedOnStyle: LLVM +# no indentation (-2 from indent, which is 2) +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +# int aaaa = 12; +# int b = 23; +# int ccc = 23; +# leaving OFF +AlignConsecutiveAssignments: false +# int aaaa = 12; +# float b = 23; +# std::string ccc = 23; +# leaving OFF +AlignConsecutiveDeclarations: false +##define A \ +# int aaaa; \ +# int b; \ +# int dddddddddd; +# leaving ON +AlignEscapedNewlines: Right +# int aaa = bbbbbbbbbbbbbbb + +# ccccccccccccccc; +# 
leaving ON +AlignOperands: true +# true: false: +# int a; // My comment a vs. int a; // My comment a +# int b = 2; // comment b int b = 2; // comment about b +# leaving ON +AlignTrailingComments: true +# squeezes a long declaration's arguments to the next line: +#true: +#void myFunction( +# int a, int b, int c, int d, int e); +# +#false: +#void myFunction(int a, +# int b, +# int c, +# int d, +# int e); +# leaving ON +AllowAllParametersOfDeclarationOnNextLine: true +# changed to ON, as we use short blocks on same lines +AllowShortBlocksOnASingleLine: true +# set this to ON, we use this in a few places +AllowShortCaseLabelsOnASingleLine: true +# set this to ON +AllowShortFunctionsOnASingleLine: Inline +AllowShortIfStatementsOnASingleLine: false +AllowShortLoopsOnASingleLine: false +# Deprecated option. +# PenaltyReturnTypeOnItsOwnLine applies, as we set this to None, +# where it tries to break after the return type automatically +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: MultiLine + +# if all the arguments for a function don't fit in a single line, +# with a value of "false", it'll split each argument into different lines +BinPackArguments: true +BinPackParameters: true + +# if this is set to Custom, the BraceWrapping flags apply +BreakBeforeBraces: Custom +BraceWrapping: + AfterClass: false + AfterControlStatement: false + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + IndentBraces: false + SplitEmptyFunction: false + SplitEmptyRecord: false + SplitEmptyNamespace: false + +# will break after operators when a line is too long +BreakBeforeBinaryOperators: None +# not in docs.. 
so that's nice +BreakBeforeInheritanceComma: false +# This will break inheritance list and align on colon, +# it also places each inherited class in a different line. +# Leaving ON +BreakInheritanceList: BeforeColon + +# +#true: +#veryVeryVeryVeryVeryVeryVeryVeryVeryVeryVeryLongDescription +# ? firstValue +# : SecondValueVeryVeryVeryVeryLong; +# +#false: +#veryVeryVeryVeryVeryVeryVeryVeryVeryVeryVeryLongDescription ? +# firstValue : +# SecondValueVeryVeryVeryVeryLong; +BreakBeforeTernaryOperators: false + +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: true +BreakStringLiterals: true +# So the line lengths in cudf are not following a limit, at the moment. +# Usually it's a long comment that makes the line length inconsistent. +# Command I used to find max line lengths (from cpp directory): +# find include src tests|grep "\." |xargs -I{} bash -c "awk '{print length}' {} | sort -rn | head -1"|sort -n +# I picked 100, as it seemed somewhere around median +ColumnLimit: 100 +# TODO: not aware of any of these at this time +CommentPragmas: '^ IWYU pragma:' +# So it doesn't put subsequent namespaces in the same line +CompactNamespaces: false +ConstructorInitializerAllOnOneLineOrOnePerLine: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +# TODO: adds spaces around the element list +# in initializer: vector x{ {}, ..., {} } +Cpp11BracedListStyle: true +DerivePointerAlignment: false +DisableFormat: false +ExperimentalAutoDetectBinPacking: false +# } // namespace a => useful +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IncludeBlocks: Regroup +IncludeCategories: + - Regex: '<[[:alnum:]]+>' + Priority: 0 + - Regex: '<[[:alnum:].]+>' + Priority: 1 + - Regex: '<.*>' + Priority: 2 + - Regex: '.*/.*' + Priority: 3 + - Regex: '.*' + Priority: 4 +# if a header matches this in an include group, it will be moved up to the +# top of the group. 
+IncludeIsMainRegex: '(Test)?$' +IndentCaseLabels: true +IndentPPDirectives: None +IndentWidth: 2 +IndentWrappedFunctionNames: false +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true + +# Penalties: leaving unchanged for now +# https://stackoverflow.com/questions/26635370/in-clang-format-what-do-the-penalties-do +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +# As currently set, we don't see return types being +# left on their own line, leaving at 60 +PenaltyReturnTypeOnItsOwnLine: 60 + +# char* foo vs char *foo, picking Right aligned +PointerAlignment: Right +ReflowComments: true +# leaving ON, but this could be something to turn OFF +SortIncludes: true +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInParentheses: false +SpacesInSquareBrackets: false +Standard: Cpp11 +TabWidth: 8 +UseTab: Never +... 
diff --git a/src/main/cpp/CMakeLists.txt b/src/main/cpp/CMakeLists.txt index 2520080cb16..4c6096774e3 100644 --- a/src/main/cpp/CMakeLists.txt +++ b/src/main/cpp/CMakeLists.txt @@ -140,15 +140,17 @@ set(CUDFJNI_INCLUDE_DIRS add_library( spark_rapids_jni SHARED - src/RowConversionJni.cpp src/CastStringJni.cpp + src/DecimalUtilsJni.cpp + src/MapUtilsJni.cpp src/NativeParquetJni.cpp + src/RowConversionJni.cpp + src/ZOrderJni.cpp src/cast_string.cu src/cast_string_to_float.cu - src/row_conversion.cu - src/DecimalUtilsJni.cpp src/decimal_utils.cu - src/ZOrderJni.cpp + src/map_utils.cu + src/row_conversion.cu src/zorder.cu ) diff --git a/src/main/cpp/src/MapUtilsJni.cpp b/src/main/cpp/src/MapUtilsJni.cpp new file mode 100644 index 00000000000..fbbcdd889ff --- /dev/null +++ b/src/main/cpp/src/MapUtilsJni.cpp @@ -0,0 +1,35 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include "map_utils.hpp" + +extern "C" { + +JNIEXPORT jlong JNICALL Java_com_nvidia_spark_rapids_jni_MapUtils_extractRawMapFromJsonString( + JNIEnv *env, jclass, jlong input_handle) { + JNI_NULL_CHECK(env, input_handle, "json_column_handle is null", 0); + + try { + cudf::jni::auto_set_device(env); + auto const input = reinterpret_cast(input_handle); + return cudf::jni::ptr_as_jlong(spark_rapids_jni::from_json(*input).release()); + } + CATCH_STD(env, 0); +} +} diff --git a/src/main/cpp/src/map_utils.cu b/src/main/cpp/src/map_utils.cu new file mode 100644 index 00000000000..590fd04cc41 --- /dev/null +++ b/src/main/cpp/src/map_utils.cu @@ -0,0 +1,635 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "map_utils_debug.cuh" + +// +#include + +// +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// +#include +#include + +// +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +// +#include + +namespace spark_rapids_jni { + +using namespace cudf::io::json; + +namespace { + +// Unify the input json strings by: +// 1. Append one comma character (',') to the end of each input string, except the last one. +// 2. Concatenate all input strings into one string. +// 3. 
Add a pair of bracket characters ('[' and ']') to the beginning and the end of the output. +rmm::device_uvector unify_json_strings(cudf::column_view const &input, + rmm::cuda_stream_view stream) { + if (input.is_empty()) { + return cudf::detail::make_device_uvector_async(std::vector{'[', ']'}, stream); + } + + auto const d_strings = cudf::column_device_view::create(input, stream); + auto const chars_size = input.child(cudf::strings_column_view::chars_column_index).size(); + auto const output_size = + 2l + // two extra bracket characters '[' and ']' + static_cast(chars_size) + + static_cast(input.size() - 1) + // append `,` character between input rows + static_cast(input.null_count()) * 2l; // replace null with "{}" + CUDF_EXPECTS(output_size <= static_cast(std::numeric_limits::max()), + "The input json column is too large and causes overflow."); + + auto const joined_input = cudf::strings::detail::join_strings( + cudf::strings_column_view{input}, + cudf::string_scalar(","), // append `,` character between the input rows + cudf::string_scalar("{}"), // replacement for null rows + stream, rmm::mr::get_current_device_resource()); + auto const joined_input_child = + joined_input->child(cudf::strings_column_view::chars_column_index); + auto const joined_input_size_bytes = joined_input_child.size(); + CUDF_EXPECTS(joined_input_size_bytes + 2 == output_size, "Incorrect output size computation."); + + // We want to concatenate 3 strings: "[" + joined_input + "]". + // For efficiency, let's use memcpy instead of `cudf::strings::detail::concatenate`. 
+ auto output = rmm::device_uvector(joined_input_size_bytes + 2, stream); + CUDF_CUDA_TRY(cudaMemsetAsync(output.data(), static_cast('['), 1, stream.value())); + CUDF_CUDA_TRY(cudaMemcpyAsync(output.data() + 1, joined_input_child.view().data(), + joined_input_size_bytes, cudaMemcpyDefault, stream.value())); + CUDF_CUDA_TRY(cudaMemsetAsync(output.data() + joined_input_size_bytes + 1, static_cast(']'), + 1, stream.value())); + +#ifdef DEBUG_FROM_JSON + print_debug(output, "Processed json string", "", stream); +#endif + return output; +} + +// Check and throw exception if there is any parsing error. +void throw_if_error(rmm::device_uvector const &input_json, + rmm::device_uvector const &tokens, + rmm::device_uvector const &token_indices, + rmm::cuda_stream_view stream) { + auto const error_count = + thrust::count(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); + + if (error_count > 0) { + auto const error_location = + thrust::find(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); + SymbolOffsetT error_index; + CUDF_CUDA_TRY(cudaMemcpyAsync( + &error_index, token_indices.data() + thrust::distance(tokens.begin(), error_location), + sizeof(SymbolOffsetT), cudaMemcpyDeviceToHost, stream.value())); + stream.synchronize(); + + constexpr auto extension = 100; + auto const begin_print_idx = std::max(error_index - extension, SymbolOffsetT{0}); + auto const end_print_idx = + std::min(error_index + extension, static_cast(input_json.size())); + auto const print_size = end_print_idx - begin_print_idx; + auto const h_input_json = cudf::detail::make_host_vector_sync( + cudf::device_span{input_json.data() + begin_print_idx, print_size}, stream); + std::cerr << "Substring of the input json with " + std::to_string(extension) + << " characters before+after the error location:\n"; + std::cerr << std::string(h_input_json.data(), h_input_json.size()) << std::endl; + + CUDF_FAIL("JSON Parser encountered an invalid format at location 
" + + std::to_string(error_index)); + } +} + +// Check if a token is a json node. +struct is_node { + __host__ __device__ bool operator()(PdaTokenT const token) const { + switch (token) { + case token_t::StructBegin: + case token_t::ListBegin: + case token_t::StringBegin: + case token_t::ValueBegin: + case token_t::FieldNameBegin: + case token_t::ErrorBegin: return true; + default: return false; + }; + } +}; + +// Compute the level of each token node. +// The top json node (top json object level) has level 0. +// Each row in the input column should have levels starting from 1. +// This is copied from cudf's `json_tree.cu`. +rmm::device_uvector compute_node_levels(int64_t num_nodes, + rmm::device_uvector const &tokens, + rmm::cuda_stream_view stream) { + auto token_levels = rmm::device_uvector(tokens.size(), stream); + + // Whether the token pops from the parent node stack. + auto const does_pop = [] __device__(PdaTokenT const token) -> bool { + switch (token) { + case token_t::StructMemberEnd: + case token_t::StructEnd: + case token_t::ListEnd: return true; + default: return false; + }; + }; + + // Whether the token pushes onto the parent node stack. 
+ auto const does_push = [] __device__(PdaTokenT const token) -> bool { + switch (token) { + case token_t::FieldNameBegin: + case token_t::StructBegin: + case token_t::ListBegin: return true; + default: return false; + }; + }; + + auto const push_pop_it = thrust::make_transform_iterator( + tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> cudf::size_type { + return does_push(token) - does_pop(token); + }); + thrust::exclusive_scan(rmm::exec_policy(stream), push_pop_it, push_pop_it + tokens.size(), + token_levels.begin()); + + auto node_levels = rmm::device_uvector(num_nodes, stream); + auto const copy_end = + cudf::detail::copy_if_safe(token_levels.begin(), token_levels.end(), tokens.begin(), + node_levels.begin(), is_node{}, stream); + CUDF_EXPECTS(thrust::distance(node_levels.begin(), copy_end) == num_nodes, + "Node level count mismatch"); + +#ifdef DEBUG_FROM_JSON + print_debug(node_levels, "Node levels", ", ", stream); +#endif + return node_levels; +} + +// Compute the map from nodes to their indices in the list of all tokens. +rmm::device_uvector +compute_node_to_token_index_map(int64_t num_nodes, rmm::device_uvector const &tokens, + rmm::cuda_stream_view stream) { + auto node_token_ids = rmm::device_uvector(num_nodes, stream); + auto const node_id_it = thrust::counting_iterator(0); + auto const copy_end = + cudf::detail::copy_if_safe(node_id_it, node_id_it + tokens.size(), tokens.begin(), + node_token_ids.begin(), is_node{}, stream); + CUDF_EXPECTS(thrust::distance(node_token_ids.begin(), copy_end) == num_nodes, + "Invalid computation for node-to-token-index map"); + +#ifdef DEBUG_FROM_JSON + print_map_debug(node_token_ids, "Node-to-token-index map", stream); +#endif + return node_token_ids; +} + +// This is copied from cudf's `json_tree.cu`. 
+template +std::pair, rmm::device_uvector> +stable_sorted_key_order(rmm::device_uvector const &keys, rmm::cuda_stream_view stream) { + // Buffers used for storing intermediate results during sorting. + rmm::device_uvector keys_buffer1(keys.size(), stream); + rmm::device_uvector keys_buffer2(keys.size(), stream); + rmm::device_uvector order_buffer1(keys.size(), stream); + rmm::device_uvector order_buffer2(keys.size(), stream); + cub::DoubleBuffer keys_buffer(keys_buffer1.data(), keys_buffer2.data()); + cub::DoubleBuffer order_buffer(order_buffer1.data(), order_buffer2.data()); + + thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_buffer1.begin()); + thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end()); + + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs(nullptr, temp_storage_bytes, keys_buffer, order_buffer, + keys.size()); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + cub::DeviceRadixSort::SortPairs(d_temp_storage.data(), temp_storage_bytes, keys_buffer, + order_buffer, keys.size(), 0, sizeof(KeyType) * 8, + stream.value()); + + return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1) : + std::move(keys_buffer2), + order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1) : + std::move(order_buffer2)}; +} + +// This is copied from cudf's `json_tree.cu`. +void propagate_parent_to_siblings(rmm::device_uvector const &node_levels, + rmm::device_uvector &parent_node_ids, + rmm::cuda_stream_view stream) { + auto const [sorted_node_levels, sorted_order] = stable_sorted_key_order(node_levels, stream); + + // Instead of gather, using permutation_iterator, which is ~17% faster. 
+ thrust::inclusive_scan_by_key( + rmm::exec_policy(stream), sorted_node_levels.begin(), sorted_node_levels.end(), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::equal_to{}, thrust::maximum{}); +} + +// This is copied from cudf's `json_tree.cu`. +rmm::device_uvector +compute_parent_node_ids(int64_t num_nodes, rmm::device_uvector const &tokens, + rmm::device_uvector const &node_token_ids, + rmm::cuda_stream_view stream) { + auto const first_childs_parent_token_id = [tokens = + tokens.begin()] __device__(auto i) -> NodeIndexT { + if (i <= 0) { + return -1; + } + if (tokens[i - 1] == token_t::StructBegin || tokens[i - 1] == token_t::ListBegin) { + return i - 1; + } else if (tokens[i - 1] == token_t::FieldNameEnd) { + return i - 2; + } else if (tokens[i - 1] == token_t::StructMemberBegin && + (tokens[i - 2] == token_t::StructBegin || tokens[i - 2] == token_t::ListBegin)) { + return i - 2; + } else { + return -1; + } + }; + + auto parent_node_ids = rmm::device_uvector(num_nodes, stream); + thrust::transform(rmm::exec_policy(stream), node_token_ids.begin(), node_token_ids.end(), + parent_node_ids.begin(), + [node_ids_gpu = node_token_ids.begin(), num_nodes, + first_childs_parent_token_id] __device__(NodeIndexT const tid) -> NodeIndexT { + auto const pid = first_childs_parent_token_id(tid); + return pid < 0 ? cudf::io::json::parent_node_sentinel : + thrust::lower_bound(thrust::seq, node_ids_gpu, + node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + }); + + // Propagate parent node to siblings from first sibling - inplace. 
+ auto const node_levels = compute_node_levels(num_nodes, tokens, stream); + propagate_parent_to_siblings(node_levels, parent_node_ids, stream); + +#ifdef DEBUG_FROM_JSON + print_debug(parent_node_ids, "Parent node ids", ", ", stream); +#endif + return parent_node_ids; +} + +constexpr int8_t key_sentinel{1}; +constexpr int8_t value_sentinel{2}; + +// Check for each node if it is a key or a value field. +rmm::device_uvector +check_key_or_value_nodes(rmm::device_uvector const &parent_node_ids, + rmm::cuda_stream_view stream) { + auto key_or_value = rmm::device_uvector(parent_node_ids.size(), stream); + auto const transform_it = thrust::counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), transform_it, transform_it + parent_node_ids.size(), + key_or_value.begin(), + [key_sentinel = key_sentinel, value_sentinel = value_sentinel, + parent_ids = parent_node_ids.begin()] __device__(auto const node_id) -> int8_t { + if (parent_ids[node_id] > 0) { + auto const grand_parent = parent_ids[parent_ids[node_id]]; + if (grand_parent == 0) { + return key_sentinel; + } else if (parent_ids[grand_parent] == 0) { + return value_sentinel; + } + } + + return 0; + }); + +#ifdef DEBUG_FROM_JSON + print_debug(key_or_value, "Nodes are key/value (1==key, 2==value)", ", ", stream); +#endif + return key_or_value; +} + +// Convert token indices to node ranges for each valid node. +struct node_ranges_fn { + cudf::device_span tokens; + cudf::device_span token_indices; + cudf::device_span node_token_ids; + cudf::device_span parent_node_ids; + cudf::device_span key_or_value; + + // Whether the extracted string values from json map will have the quote character. 
+ static const bool include_quote_char{false}; + + __device__ thrust::pair operator()(cudf::size_type node_id) const { + [[maybe_unused]] auto const is_begin_of_section = [] __device__(PdaTokenT const token) { + switch (token) { + case token_t::StructBegin: + case token_t::ListBegin: + case token_t::StringBegin: + case token_t::ValueBegin: + case token_t::FieldNameBegin: return true; + default: return false; + }; + }; + + // The end-of-* partner token for a given beginning-of-* token + auto const end_of_partner = [] __device__(PdaTokenT const token) { + switch (token) { + case token_t::StructBegin: return token_t::StructEnd; + case token_t::ListBegin: return token_t::ListEnd; + case token_t::StringBegin: return token_t::StringEnd; + case token_t::ValueBegin: return token_t::ValueEnd; + case token_t::FieldNameBegin: return token_t::FieldNameEnd; + default: return token_t::ErrorBegin; + }; + }; + + // Encode a fixed value for nested node types (list+struct). + auto const nested_node_to_value = [] __device__(PdaTokenT const token) -> int32_t { + switch (token) { + case token_t::StructBegin: return 1; + case token_t::StructEnd: return -1; + case token_t::ListBegin: return 1 << 8; + case token_t::ListEnd: return -(1 << 8); + default: return 0; + }; + }; + + auto const get_token_index = [include_quote_char = include_quote_char] __device__( + PdaTokenT const token, SymbolOffsetT const token_index) { + constexpr SymbolOffsetT quote_char_size = 1; + switch (token) { + // Strip off quote char included for StringBegin + case token_t::StringBegin: return token_index + (include_quote_char ? 0 : quote_char_size); + // Strip off or Include trailing quote char for string values for StringEnd + case token_t::StringEnd: return token_index + (include_quote_char ? 
quote_char_size : 0); + // Strip off quote char included for FieldNameBegin + case token_t::FieldNameBegin: return token_index + quote_char_size; + default: return token_index; + }; + }; + + if (key_or_value[node_id] != key_sentinel && key_or_value[node_id] != value_sentinel) { + return thrust::make_pair(0, 0); + } + + auto const token_idx = node_token_ids[node_id]; + auto const token = tokens[token_idx]; + cudf_assert(is_begin_of_section(token) && "Invalid node category."); + + // The section from the original JSON input that this token demarcates. + auto const range_begin = get_token_index(token, token_indices[token_idx]); + auto range_end = range_begin + 1; // non-leaf, non-field nodes ignore this value. + if ((token_idx + 1) < tokens.size() && end_of_partner(token) == tokens[token_idx + 1]) { + // Update the range_end for this pair of tokens + range_end = get_token_index(tokens[token_idx + 1], token_indices[token_idx + 1]); + } else { + auto nested_range_value = nested_node_to_value(token); // iterate until this is zero + auto end_idx = token_idx + 1; + while (end_idx < tokens.size()) { + nested_range_value += nested_node_to_value(tokens[end_idx]); + if (nested_range_value == 0) { + range_end = get_token_index(tokens[end_idx], token_indices[end_idx]) + 1; + break; + } + ++end_idx; + } + cudf_assert(nested_range_value == 0 && "Invalid range computation."); + cudf_assert((end_idx + 1 < tokens.size()) && "Invalid range computation."); + } + return thrust::make_pair(range_begin, range_end); + } +}; + +// Compute index range for each node. +// These ranges identify positions to extract nodes from the unified json string. 
+rmm::device_uvector> +compute_node_ranges(int64_t num_nodes, rmm::device_uvector const &tokens, + rmm::device_uvector const &token_indices, + rmm::device_uvector const &node_token_ids, + rmm::device_uvector const &parent_node_ids, + rmm::device_uvector const &key_or_value, rmm::cuda_stream_view stream) { + auto node_ranges = + rmm::device_uvector>(num_nodes, stream); + auto const transform_it = thrust::counting_iterator(0); + thrust::transform( + rmm::exec_policy(stream), transform_it, transform_it + num_nodes, node_ranges.begin(), + node_ranges_fn{tokens, token_indices, node_token_ids, parent_node_ids, key_or_value}); + +#ifdef DEBUG_FROM_JSON + print_pair_debug(node_ranges, "Node ranges", stream); +#endif + return node_ranges; +} + +// Function logic for substring API. +// This both calculates the output size and executes the substring. +// No bound check is performed, assuming that the substring bounds are all valid. +struct substring_fn { + cudf::device_span const d_string; + cudf::device_span const> const d_ranges; + + cudf::offset_type *d_offsets{}; + char *d_chars{}; + + __device__ void operator()(cudf::size_type const idx) { + auto const range = d_ranges[idx]; + auto const size = range.second - range.first; + if (d_chars) { + memcpy(d_chars + d_offsets[idx], d_string.data() + range.first, size); + } else { + d_offsets[idx] = size; + } + } +}; + +// Extract key-value string pairs from the input json string. 
+std::unique_ptr extract_keys_or_values( + bool extract_key, int64_t num_nodes, + rmm::device_uvector> const &node_ranges, + rmm::device_uvector const &key_or_value, + rmm::device_uvector const &unified_json_buff, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + auto const is_key = [key_or_value = key_or_value.begin()] __device__(auto const node_id) { + return key_or_value[node_id] == key_sentinel; + }; + + auto const is_value = [key_or_value = key_or_value.begin()] __device__(auto const node_id) { + return key_or_value[node_id] == value_sentinel; + }; + + auto extract_ranges = + rmm::device_uvector>(num_nodes, stream, mr); + auto const stencil_it = thrust::make_counting_iterator(0); + auto const range_end = + extract_key ? cudf::detail::copy_if_safe(node_ranges.begin(), node_ranges.end(), stencil_it, + extract_ranges.begin(), is_key, stream) : + cudf::detail::copy_if_safe(node_ranges.begin(), node_ranges.end(), stencil_it, + extract_ranges.begin(), is_value, stream); + auto const num_extract = thrust::distance(extract_ranges.begin(), range_end); + + auto children = cudf::strings::detail::make_strings_children( + substring_fn{unified_json_buff, extract_ranges}, num_extract, stream, mr); + return cudf::make_strings_column(num_extract, std::move(children.first), + std::move(children.second), 0, rmm::device_buffer{}); +} + +// Compute the offsets for the final lists of Struct. +rmm::device_uvector +compute_list_offsets(cudf::size_type n_lists, + rmm::device_uvector const &parent_node_ids, + rmm::device_uvector const &key_or_value, rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + // Count the number of children nodes for the json object nodes. + // These object nodes are given as one row of the input json strings column. 
+ auto node_child_counts = rmm::device_uvector(parent_node_ids.size(), stream); + + // For the nodes having parent_id == 0 (they are json object given by one input row), set their + // child counts to zero. Otherwise, set child counts to `-1` (a sentinel number). + thrust::transform(rmm::exec_policy(stream), parent_node_ids.begin(), parent_node_ids.end(), + node_child_counts.begin(), [] __device__(auto const parent_id) -> NodeIndexT { + return parent_id == 0 ? 0 : std::numeric_limits::lowest(); + }); + + auto const is_key = [key_or_value = key_or_value.begin()] __device__(auto const node_id) { + return key_or_value[node_id] == key_sentinel; + }; + + // Count the number of keys for each json object using `atomicAdd`. + auto const transform_it = thrust::counting_iterator(0); + thrust::for_each(rmm::exec_policy(stream), transform_it, transform_it + parent_node_ids.size(), + [is_key, child_counts = node_child_counts.begin(), + parent_ids = parent_node_ids.begin()] __device__(auto const node_id) { + if (is_key(node_id)) { + auto const parent_id = parent_ids[node_id]; + atomicAdd(&child_counts[parent_id], 1); + } + }); +#ifdef DEBUG_FROM_JSON + print_debug(node_child_counts, "Nodes' child keys counts", ", ", stream); +#endif + + auto list_offsets = rmm::device_uvector(n_lists + 1, stream, mr); + auto const copy_end = cudf::detail::copy_if_safe( + node_child_counts.begin(), node_child_counts.end(), list_offsets.begin(), + [] __device__(auto const count) { return count >= 0; }, stream); + CUDF_EXPECTS(thrust::distance(list_offsets.begin(), copy_end) == static_cast(n_lists), + "Invalid list size computation."); +#ifdef DEBUG_FROM_JSON + print_debug(list_offsets, "Output list sizes (except the last one)", ", ", stream); +#endif + + thrust::exclusive_scan(rmm::exec_policy(stream), list_offsets.begin(), list_offsets.end(), + list_offsets.begin()); +#ifdef DEBUG_FROM_JSON + print_debug(list_offsets, "Output list offsets", ", ", stream); +#endif + return list_offsets; +} + +} // 
namespace + +std::unique_ptr from_json(cudf::column_view const &input, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource *mr) { + CUDF_EXPECTS(input.type().id() == cudf::type_id::STRING, "Invalid input format"); + + // Firstly, concatenate all the input json strings into one giant input json string. + // When testing/debugging, the output can be validated using + // https://jsonformatter.curiousconcept.com. + auto const unified_json_buff = unify_json_strings(input, stream); + + // Tokenize the input json strings. + static_assert(sizeof(SymbolT) == sizeof(char), + "Invalid internal data for nested json tokenizer."); + auto const [tokens, token_indices] = cudf::io::json::detail::get_token_stream( + cudf::device_span{unified_json_buff.data(), unified_json_buff.size()}, + cudf::io::json_reader_options{}, stream); + +#ifdef DEBUG_FROM_JSON + print_debug(tokens, "Tokens", ", ", stream); + print_debug(token_indices, "Token indices", ", ", stream); +#endif + + // Make sure there is no error during parsing. + throw_if_error(unified_json_buff, tokens, token_indices, stream); + + auto const num_nodes = + thrust::count_if(rmm::exec_policy(stream), tokens.begin(), tokens.end(), is_node{}); + + // Compute the map from nodes to their indices in the list of all tokens. + auto const node_token_ids = compute_node_to_token_index_map(num_nodes, tokens, stream); + + // A map from each node to the index of its parent node. + auto const parent_node_ids = compute_parent_node_ids(num_nodes, tokens, node_token_ids, stream); + + // Check for each node if it is a map key or a map value to extract. + auto const key_or_value_node = check_key_or_value_nodes(parent_node_ids, stream); + + // Compute index range for each node. + // These ranges identify positions to extract nodes from the unified json string. 
+ auto const node_ranges = compute_node_ranges(num_nodes, tokens, token_indices, node_token_ids, + parent_node_ids, key_or_value_node, stream); + + // + // From below are variables for returning output. + // + + auto extracted_keys = extract_keys_or_values(true /*key*/, num_nodes, node_ranges, + key_or_value_node, unified_json_buff, stream, mr); + auto extracted_values = extract_keys_or_values(false /*value*/, num_nodes, node_ranges, + key_or_value_node, unified_json_buff, stream, mr); + CUDF_EXPECTS(extracted_keys->size() == extracted_values->size(), + "Invalid key-value pair extraction."); + + // Compute the offsets of the final output lists column. + auto list_offsets = + compute_list_offsets(input.size(), parent_node_ids, key_or_value_node, stream, mr); + +#ifdef DEBUG_FROM_JSON + print_output_spark_map(list_offsets, extracted_keys, extracted_values, stream); +#endif + + auto const num_pairs = extracted_keys->size(); + std::vector> out_keys_vals; + out_keys_vals.emplace_back(std::move(extracted_keys)); + out_keys_vals.emplace_back(std::move(extracted_values)); + auto structs_col = cudf::make_structs_column(num_pairs, std::move(out_keys_vals), 0, + rmm::device_buffer{}, stream, mr); + + return cudf::make_lists_column( + input.size(), std::make_unique(std::move(list_offsets)), std::move(structs_col), + input.null_count(), cudf::detail::copy_bitmask(input, stream, mr), stream, mr); +} + +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/map_utils.hpp b/src/main/cpp/src/map_utils.hpp new file mode 100644 index 00000000000..ddf66b07de0 --- /dev/null +++ b/src/main/cpp/src/map_utils.hpp @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include + +#include +#include +#include + +namespace spark_rapids_jni { + +std::unique_ptr +from_json(cudf::column_view const &input, rmm::cuda_stream_view stream = cudf::get_default_stream(), + rmm::mr::device_memory_resource *mr = rmm::mr::get_current_device_resource()); + +} // namespace spark_rapids_jni diff --git a/src/main/cpp/src/map_utils_debug.cuh b/src/main/cpp/src/map_utils_debug.cuh new file mode 100644 index 00000000000..652fa846722 --- /dev/null +++ b/src/main/cpp/src/map_utils_debug.cuh @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +//#define DEBUG_FROM_JSON + +#ifdef DEBUG_FROM_JSON + +#include +#include +#include + +// +#include +#include + +// +#include + +namespace spark_rapids_jni { + +using namespace cudf::io::json; + +// Convert the token value into string name, for debugging purpose. 
+std::string token_to_string(PdaTokenT const token_type) { + switch (token_type) { + case token_t::StructBegin: return "StructBegin"; + case token_t::StructEnd: return "StructEnd"; + case token_t::ListBegin: return "ListBegin"; + case token_t::ListEnd: return "ListEnd"; + case token_t::StructMemberBegin: return "StructMemberBegin"; + case token_t::StructMemberEnd: return "StructMemberEnd"; + case token_t::FieldNameBegin: return "FieldNameBegin"; + case token_t::FieldNameEnd: return "FieldNameEnd"; + case token_t::StringBegin: return "StringBegin"; + case token_t::StringEnd: return "StringEnd"; + case token_t::ValueBegin: return "ValueBegin"; + case token_t::ValueEnd: return "ValueEnd"; + case token_t::ErrorBegin: return "ErrorBegin"; + default: return "Unknown"; + } +} + +// Print the content of the input device vector. +template +void print_debug(rmm::device_uvector const &input, std::string const &name, + std::string const &separator, rmm::cuda_stream_view stream) { + auto const h_input = cudf::detail::make_host_vector_sync( + cudf::device_span{input.data(), input.size()}, stream); + std::stringstream ss; + ss << name << ":\n"; + for (size_t i = 0; i < h_input.size(); ++i) { + ss << static_cast(h_input[i]); + if (separator.size() > 0 && i + 1 < h_input.size()) { + ss << separator; + } + } + std::cerr << ss.str() << std::endl; +} + +// Print the content of the input map given by a device vector. +template +void print_map_debug(rmm::device_uvector const &input, std::string const &name, + rmm::cuda_stream_view stream) { + auto const h_input = cudf::detail::make_host_vector_sync( + cudf::device_span{input.data(), input.size()}, stream); + std::stringstream ss; + ss << name << ":\n"; + for (size_t i = 0; i < h_input.size(); ++i) { + ss << i << " => " << static_cast(h_input[i]) << "\n"; + } + std::cerr << ss.str() << std::endl; +} + +// Print the content of the input pairs given by a device vector. 
+template +void print_pair_debug(rmm::device_uvector const &input, std::string const &name, + rmm::cuda_stream_view stream) { + auto const h_input = cudf::detail::make_host_vector_sync( + cudf::device_span{input.data(), input.size()}, stream); + std::stringstream ss; + ss << name << ":\n"; + for (size_t i = 0; i < h_input.size(); ++i) { + ss << "[ " << static_cast(h_input[i].first) << ", " << static_cast(h_input[i].second) + << " ]\n"; + } + std::cerr << ss.str() << std::endl; +} + +// Print the final output map data (Spark's MapType, i.e., List>). +void print_output_spark_map(rmm::device_uvector const &list_offsets, + std::unique_ptr const &extracted_keys, + std::unique_ptr const &extracted_values, + rmm::cuda_stream_view stream) { + auto const keys_child = extracted_keys->child(cudf::strings_column_view::chars_column_index); + auto const keys_offsets = extracted_keys->child(cudf::strings_column_view::offsets_column_index); + auto const values_child = extracted_values->child(cudf::strings_column_view::chars_column_index); + auto const values_offsets = + extracted_values->child(cudf::strings_column_view::offsets_column_index); + + auto const h_extracted_keys_child = cudf::detail::make_host_vector_sync( + cudf::device_span{keys_child.view().data(), + static_cast(keys_child.size())}, + stream); + auto const h_extracted_keys_offsets = cudf::detail::make_host_vector_sync( + cudf::device_span{keys_offsets.view().data(), + static_cast(keys_offsets.size())}, + stream); + + auto const h_extracted_values_child = cudf::detail::make_host_vector_sync( + cudf::device_span{values_child.view().data(), + static_cast(values_child.size())}, + stream); + auto const h_extracted_values_offsets = cudf::detail::make_host_vector_sync( + cudf::device_span{values_offsets.view().data(), + static_cast(values_offsets.size())}, + stream); + + auto const h_list_offsets = cudf::detail::make_host_vector_sync( + cudf::device_span{list_offsets.data(), list_offsets.size()}, stream); + 
CUDF_EXPECTS(h_list_offsets.back() == extracted_keys->size(), + "Invalid list offsets computation."); + + std::stringstream ss; + ss << "Extract keys-values:\n"; + + for (size_t i = 0; i + 1 < h_list_offsets.size(); ++i) { + ss << "List " << i << ": [" << h_list_offsets[i] << ", " << h_list_offsets[i + 1] << "]\n"; + for (cudf::size_type string_idx = h_list_offsets[i]; string_idx < h_list_offsets[i + 1]; + ++string_idx) { + { + auto const string_begin = h_extracted_keys_offsets[string_idx]; + auto const string_end = h_extracted_keys_offsets[string_idx + 1]; + auto const size = string_end - string_begin; + auto const ptr = &h_extracted_keys_child[string_begin]; + ss << "\t\"" << std::string(ptr, size) << "\" : "; + } + { + auto const string_begin = h_extracted_values_offsets[string_idx]; + auto const string_end = h_extracted_values_offsets[string_idx + 1]; + auto const size = string_end - string_begin; + auto const ptr = &h_extracted_values_child[string_begin]; + ss << "\"" << std::string(ptr, size) << "\"\n"; + } + } + } + std::cerr << ss.str() << std::endl; +} + +} // namespace spark_rapids_jni + +#endif // DEBUG_FROM_JSON diff --git a/src/main/java/com/nvidia/spark/rapids/jni/MapUtils.java b/src/main/java/com/nvidia/spark/rapids/jni/MapUtils.java new file mode 100644 index 00000000000..140455b4621 --- /dev/null +++ b/src/main/java/com/nvidia/spark/rapids/jni/MapUtils.java @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +package com.nvidia.spark.rapids.jni; + +import ai.rapids.cudf.ColumnVector; +import ai.rapids.cudf.ColumnView; +import ai.rapids.cudf.DType; +import ai.rapids.cudf.NativeDepsLoader; + +public class MapUtils { + static { + NativeDepsLoader.loadNativeDeps(); + } + + + /** + * Extract key-value pairs for each output map from the given json strings. These key-value are + * copied directly as substrings of the input without any type conversion. + *

+ * Since there is no validity check, the output of this function may be different from + * what is generated by Spark's `from_json` function. Situations that can lead to + * different/incorrect outputs may include:
+ * - The value in the input json string is invalid, such as an 'abc' value for an integer key.
+ * - The value string can be in a non-standard format for a floating-point type, such as '1.00000'. + *

+ * The output of these situations should all be NULL or a value '1.0', respectively. However, this + * function will just simply copy the input value strings to the output. + * + * @param jsonColumn The input strings column in which each row specifies a json object. + * @return A map column (i.e., a column of type {@code List>}) in + * which the key-value pairs are extracted directly from the input json strings. + */ + public static ColumnVector extractRawMapFromJsonString(ColumnView jsonColumn) { + assert jsonColumn.getType().equals(DType.STRING) : "Input type must be String"; + return new ColumnVector(extractRawMapFromJsonString(jsonColumn.getNativeView())); + } + + + private static native long extractRawMapFromJsonString(long jsonColumnHandle); + +} diff --git a/src/test/java/com/nvidia/spark/rapids/jni/MapUtilsTest.java b/src/test/java/com/nvidia/spark/rapids/jni/MapUtilsTest.java new file mode 100644 index 00000000000..773ef7ac375 --- /dev/null +++ b/src/test/java/com/nvidia/spark/rapids/jni/MapUtilsTest.java @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +package com.nvidia.spark.rapids.jni; + +import ai.rapids.cudf.ColumnVector; +import ai.rapids.cudf.BinaryOp; + +import org.junit.jupiter.api.Test; + +import static ai.rapids.cudf.AssertUtils.assertColumnsAreEqual; + +public class MapUtilsTest { + + @Test + void testFromJsonSimpleInput() { + String jsonString1 = "{\"Zipcode\" : 704 , \"ZipCodeType\" : \"STANDARD\" , \"City\" : \"PARC" + + " PARQUE\" , \"State\" : \"PR\"}"; + String jsonString2 = "{}"; + String jsonString3 = "{\"category\": \"reference\", \"index\": [4,{},null,{\"a\":[{ }, {}] } " + + "], \"author\": \"Nigel Rees\", \"title\": \"{}[], <=semantic-symbols-string\", " + + "\"price\": 8.95}"; + + try (ColumnVector input = + ColumnVector.fromStrings(jsonString1, jsonString2, null, jsonString3); + ColumnVector outputMap = MapUtils.extractRawMapFromJsonString(input); + + ColumnVector expectedKeys = ColumnVector.fromStrings("Zipcode", "ZipCodeType", "City", + "State", "category", "index", "author", "title", "price"); + ColumnVector expectedValues = ColumnVector.fromStrings("704", "STANDARD", "PARC PARQUE", + "PR", "reference", "[4,{},null,{\"a\":[{ }, {}] } ]", "Nigel Rees", "{}[], " + + "<=semantic-symbols-string", "8.95"); + ColumnVector expectedStructs = ColumnVector.makeStruct(expectedKeys, expectedValues); + ColumnVector expectedOffsets = ColumnVector.fromInts(0, 4, 4, 4, 9); + ColumnVector tmpMap = expectedStructs.makeListFromOffsets(4, expectedOffsets); + ColumnVector templateBitmask = ColumnVector.fromBoxedInts(1, 1, null, 1); + ColumnVector expectedMap = tmpMap.mergeAndSetValidity(BinaryOp.BITWISE_AND, + templateBitmask); + ) { + assertColumnsAreEqual(expectedMap, outputMap); + } + } + + @Test + void testFromJsonWithUTF8() { + String jsonString1 = "{\"Zipc\u00f3de\" : 704 , \"Z\u00edpCodeTyp\u00e9\" : \"STANDARD\" ," + + " \"City\" : \"PARC PARQUE\" , \"St\u00e2te\" : \"PR\"}"; + String jsonString2 = "{}"; + String jsonString3 = "{\"Zipc\u00f3de\" : 704 , \"Z\u00edpCodeTyp\u00e9\" : " + + 
"\"\uD867\uDE3D\" , " + "\"City\" : \"\uD83C\uDFF3\" , \"St\u00e2te\" : " + + "\"\uD83C\uDFF3\"}"; + + try (ColumnVector input = + ColumnVector.fromStrings(jsonString1, jsonString2, null, jsonString3); + ColumnVector outputMap = MapUtils.extractRawMapFromJsonString(input); + + ColumnVector expectedKeys = ColumnVector.fromStrings("Zipc\u00f3de", "Z\u00edpCodeTyp" + + "\u00e9", "City", "St\u00e2te", "Zipc\u00f3de", "Z\u00edpCodeTyp\u00e9", + "City", "St\u00e2te"); + ColumnVector expectedValues = ColumnVector.fromStrings("704", "STANDARD", "PARC PARQUE", + "PR", "704", "\uD867\uDE3D", "\uD83C\uDFF3", "\uD83C\uDFF3"); + ColumnVector expectedStructs = ColumnVector.makeStruct(expectedKeys, expectedValues); + ColumnVector expectedOffsets = ColumnVector.fromInts(0, 4, 4, 4, 8); + ColumnVector tmpMap = expectedStructs.makeListFromOffsets(4, expectedOffsets); + ColumnVector templateBitmask = ColumnVector.fromBoxedInts(1, 1, null, 1); + ColumnVector expectedMap = tmpMap.mergeAndSetValidity(BinaryOp.BITWISE_AND, + templateBitmask); + ) { + assertColumnsAreEqual(expectedMap, outputMap); + } + } +}