From 0ba4675056929b070082e4ac2aa8465da253469f Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Tue, 20 Sep 2022 07:29:01 +0200 Subject: [PATCH] Adds type inference and type conversion for leaf-columns to the nested JSON parser (#11574) Adds type inference and type conversion for leaf-columns to the nested JSON parser **Note to the reviewers**: It's important to note that we're talking about two different stages of quote-stripping here. 1. Including/excluding quotes in the tokenizer stage (currently always set to `true` using a `constexpr bool`) 2. Including/excluding quotes in the type conversion stage Currently, we always include quotes in the tokenizer stage (1), such that the type casting stage (2) can differentiate between string values and literals (e.g. `[true, "true"]`) and, based on the user-provided choice in `json_reader_options::keep_quotes`, can strip off the quotes or keep them in the values returned to the user. **In addition to adding type inference and type casting:** - Switches logic for inferring nested columns. Any column with at least one nested item (list or struct) is now inferred as that respective nested column, making all other _non-nested_ items of that column invalid. E.g., `[null,{"a":1},"foo"] => List<Struct<a:int>> with struct col validity: 0, 1, 0` - Adds option for `keep_quotes` to differentiate between string values and numeric & literal values, like (`123.4`, `true`, `false`, `null`). - Migrates libcudf test to cudf test to avoid having large byte BLOBs in source file - Changes column order to match the behaviour of pandas and existing JSON lines reader. 
That is, column order corresponds to the order in which the columns were discovered: `[{"b":1, "c":1}, {"a":1}] => order: b, c, a` - Adds support for escape sequences (see below) ## Performance comparison ### Tokenizer The following is a comparison of the **JSON tokenizer** stage before this PR and after: #### Before ``` # Benchmark Results ## json_tokenizer ### [0] Tesla V100-SXM2-32GB | string_size | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | |-------------------|---------|-----------|-------|-----------|-------|----------| | 2^20 = 1048576 | 2176x | 2.489 ms | 9.62% | 2.480 ms | 9.61% | 422.729M | | 2^21 = 2097152 | 1936x | 2.501 ms | 7.14% | 2.492 ms | 7.12% | 841.482M | | 2^22 = 4194304 | 1152x | 2.612 ms | 5.43% | 2.604 ms | 5.42% | 1.611G | | 2^23 = 8388608 | 1456x | 2.855 ms | 4.26% | 2.847 ms | 4.23% | 2.947G | | 2^24 = 16777216 | 1104x | 3.395 ms | 5.34% | 3.387 ms | 5.33% | 4.954G | | 2^25 = 33554432 | 560x | 4.410 ms | 2.25% | 4.402 ms | 2.25% | 7.623G | | 2^26 = 67108864 | 1552x | 6.482 ms | 2.23% | 6.473 ms | 2.22% | 10.367G | | 2^27 = 134217728 | 1435x | 10.430 ms | 2.70% | 10.422 ms | 2.70% | 12.879G | | 2^28 = 268435456 | 815x | 18.396 ms | 1.95% | 18.387 ms | 1.95% | 14.599G | | 2^29 = 536870912 | 15x | 34.389 ms | 0.42% | 34.381 ms | 0.42% | 15.615G | | 2^30 = 1073741824 | 11x | 66.097 ms | 0.20% | 66.088 ms | 0.20% | 16.247G | ``` #### After ``` # Benchmark Results ## json_tokenizer ### [0] Tesla V100-SXM2-32GB | string_size | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | |-------------------|---------|------------|--------|------------|--------|----------| | 2^20 = 1048576 | 1408x | 2.600 ms | 11.28% | 2.592 ms | 11.26% | 404.547M | | 2^21 = 2097152 | 800x | 2.838 ms | 7.68% | 2.829 ms | 7.67% | 741.243M | | 2^22 = 4194304 | 2752x | 3.719 ms | 9.24% | 3.710 ms | 9.23% | 1.130G | | 2^23 = 8388608 | 128x | 4.855 ms | 3.38% | 4.846 ms | 3.37% | 1.731G | | 2^24 = 16777216 | 720x | 7.029 ms | 4.67% | 7.021 ms | 4.66% | 2.390G | | 2^25 = 33554432 | 
832x | 10.760 ms | 3.83% | 10.751 ms | 3.83% | 3.121G | | 2^26 = 67108864 | 576x | 17.961 ms | 2.86% | 17.953 ms | 2.86% | 3.738G | | 2^27 = 134217728 | 461x | 32.550 ms | 2.13% | 32.542 ms | 2.13% | 4.124G | | 2^28 = 268435456 | 243x | 61.813 ms | 1.60% | 61.805 ms | 1.60% | 4.343G | | 2^29 = 536870912 | 125x | 120.445 ms | 1.21% | 120.437 ms | 1.21% | 4.458G | | 2^30 = 1073741824 | 66x | 228.833 ms | 0.75% | 228.825 ms | 0.75% | 4.692G | ``` ### JSON Parser The overall parser performance is obviously impacted as we're now also doing type conversion instead of just returning string columns. #### Before ``` # Benchmark Results ## nested_json_gpu_parser ### [0] Tesla V100-SXM2-32GB | string_size | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | |-------------------|---------|------------|-------|------------|-------|----------| | 2^20 = 1048576 | 1040x | 7.361 ms | 5.61% | 7.353 ms | 5.61% | 142.614M | | 2^21 = 2097152 | 832x | 11.549 ms | 3.63% | 11.541 ms | 3.63% | 181.708M | | 2^22 = 4194304 | 740x | 20.264 ms | 2.98% | 20.257 ms | 2.98% | 207.054M | | 2^23 = 8388608 | 407x | 36.844 ms | 2.26% | 36.837 ms | 2.26% | 227.724M | | 2^24 = 16777216 | 80x | 75.590 ms | 1.95% | 75.582 ms | 1.95% | 221.974M | | 2^25 = 33554432 | 80x | 179.442 ms | 4.40% | 179.434 ms | 4.40% | 187.001M | | 2^26 = 67108864 | 40x | 379.821 ms | 0.98% | 379.815 ms | 0.98% | 176.688M | | 2^27 = 134217728 | 20x | 777.351 ms | 1.72% | 777.347 ms | 1.72% | 172.661M | | 2^28 = 268435456 | 10x | 1.550 s | 0.99% | 1.550 s | 0.99% | 173.212M | | 2^29 = 536870912 | 5x | 3.055 s | 0.41% | 3.055 s | 0.41% | 175.749M | | 2^30 = 1073741824 | 3x | 6.315 s | inf% | 6.315 s | inf% | 170.018M | ``` #### After ``` | string_size | Samples | CPU Time | Noise | GPU Time | Noise | Elem/s | |-------------------|---------|------------|-------|------------|-------|----------| | 2^20 = 1048576 | 1568x | 7.908 ms | 5.24% | 7.900 ms | 5.24% | 132.730M | | 2^21 = 2097152 | 576x | 12.235 ms | 3.24% | 12.228 ms | 
3.24% | 171.509M | | 2^22 = 4194304 | 192x | 21.171 ms | 2.09% | 21.164 ms | 2.09% | 198.182M | | 2^23 = 8388608 | 96x | 38.990 ms | 1.96% | 38.983 ms | 1.96% | 215.188M | | 2^24 = 16777216 | 192x | 78.414 ms | 2.21% | 78.407 ms | 2.21% | 213.977M | | 2^25 = 33554432 | 81x | 187.007 ms | 6.47% | 187.000 ms | 6.47% | 179.435M | | 2^26 = 67108864 | 38x | 400.007 ms | 1.59% | 400.000 ms | 1.59% | 167.772M | | 2^27 = 134217728 | 19x | 801.575 ms | 1.29% | 801.571 ms | 1.29% | 167.443M | | 2^28 = 268435456 | 10x | 1.590 s | 0.42% | 1.590 s | 0.42% | 168.799M | | 2^29 = 536870912 | 5x | 3.150 s | 0.40% | 3.150 s | 0.40% | 170.456M | | 2^30 = 1073741824 | 3x | 6.402 s | inf% | 6.402 s | inf% | 167.712M | ``` ## Supported escape sequences: ``` \" represents the quotation mark character (U+0022). \\ represents the reverse solidus character (U+005C). \/ represents the solidus character (U+002F). \b represents the backspace character (U+0008). \f represents the form feed character (U+000C). \n represents the line feed character (U+000A). \r represents the carriage return character (U+000D). \t represents the character tabulation character (U+0009). 
\uDDDD, where `D` is a hex digit 0-9, a-f, A-F, for code points in the BMP (Basic Multilingual Plane) \uDDDD\uDDDD, where `D` is a hex digit 0-9, a-f, A-F, representing UTF-16 surrogate pairs for remaining Unicode code points ``` Authors: - Elias Stehle (https://github.com/elstehle) - Vukasin Milovanovic (https://github.com/vuule) - Yunsong Wang (https://github.com/PointKernel) - Karthikeyan (https://github.com/karthikeyann) Approvers: - Michael Wang (https://github.com/isVoid) - Karthikeyan (https://github.com/karthikeyann) - Vukasin Milovanovic (https://github.com/vuule) URL: https://github.com/rapidsai/cudf/pull/11574 --- cpp/include/cudf/io/json.hpp | 31 ++++ cpp/src/io/json/nested_json.hpp | 73 +-------- cpp/src/io/json/nested_json_gpu.cu | 246 +++++++++++++++++++++++----- cpp/tests/io/json_test.cpp | 63 +++++++ cpp/tests/io/nested_json_test.cpp | 148 ++++++----------- python/cudf/cudf/tests/test_json.py | 48 +++++- 6 files changed, 404 insertions(+), 205 deletions(-) diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 73724b99589..aa7dca0dad3 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -83,6 +83,9 @@ class json_reader_options { // Whether to use the experimental reader bool _experimental = false; + // Whether to keep the quote characters of string values + bool _keep_quotes = false; + /** * @brief Constructor from source info. * @@ -203,6 +206,13 @@ class json_reader_options { */ bool is_enabled_experimental() const { return _experimental; } + /** + * @brief Whether the experimental reader should keep quotes of string values. + * + * @returns true if the experimental reader should keep quotes, false otherwise + */ + bool is_enabled_keep_quotes() const { return _keep_quotes; } + /** * @brief Set data types for columns to be read. 
* @@ -258,6 +268,14 @@ class json_reader_options { * @param val Boolean value to enable/disable the experimental reader */ void enable_experimental(bool val) { _experimental = val; } + + /** + * @brief Set whether the experimental reader should keep quotes of string values. + * + * @param val Boolean value to indicate whether the experimental reader should keep quotes + * of string values + */ + void enable_keep_quotes(bool val) { _keep_quotes = val; } }; /** @@ -377,6 +395,19 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether the experimental reader should keep quotes of string values. + * + * @param val Boolean value to indicate whether the experimental reader should keep quotes + * of string values + * @return this for chaining + */ + json_reader_options_builder& keep_quotes(bool val) + { + options._keep_quotes = val; + return *this; + } + /** * @brief move json_reader_options member once it's built. */ diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index fca9a3ecc42..dccd6a81e28 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -21,6 +21,7 @@ #include #include #include +#include #include #include @@ -127,6 +128,7 @@ struct json_column { // Following "items" as the default child column's name of a list column // Using the struct's field names std::map child_columns; + std::vector column_order; // Counting the current number of items in this column row_offset_t current_offset = 0; @@ -142,19 +144,7 @@ struct json_column { * * @param up_to_row_offset The row offset up to which to fill with nulls. */ - void null_fill(row_offset_t up_to_row_offset) - { - // Fill all the rows up to up_to_row_offset with "empty"/null rows - validity.resize(word_index(up_to_row_offset) + 1); - std::fill_n(std::back_inserter(string_offsets), - up_to_row_offset - string_offsets.size(), - (string_offsets.size() > 0) ? 
string_offsets.back() : 0); - std::fill_n(std::back_inserter(string_lengths), up_to_row_offset - string_lengths.size(), 0); - std::fill_n(std::back_inserter(child_offsets), - up_to_row_offset + 1 - child_offsets.size(), - (child_offsets.size() > 0) ? child_offsets.back() : 0); - current_offset = up_to_row_offset; - } + void null_fill(row_offset_t up_to_row_offset); /** * @brief Recursively iterates through the tree of columns making sure that all child columns of a @@ -162,26 +152,7 @@ struct json_column { * * @param min_row_count The minimum number of rows to be filled. */ - void level_child_cols_recursively(row_offset_t min_row_count) - { - // Fill this columns with nulls up to the given row count - null_fill(min_row_count); - - // If this is a struct column, we need to level all its child columns - if (type == json_col_t::StructColumn) { - for (auto it = std::begin(child_columns); it != std::end(child_columns); it++) { - it->second.level_child_cols_recursively(min_row_count); - } - } - // If this is a list column, we need to make sure that its child column levels its children - else if (type == json_col_t::ListColumn) { - auto it = std::begin(child_columns); - // Make that child column fill its child columns up to its own row count - if (it != std::end(child_columns)) { - it->second.level_child_cols_recursively(it->second.current_offset); - } - } - } + void level_child_cols_recursively(row_offset_t min_row_count); /** * @brief Appends the row at the given index to the column, filling all rows between the column's @@ -195,42 +166,10 @@ struct json_column { * the offsets */ void append_row(uint32_t row_index, - json_col_t const& row_type, + json_col_t row_type, uint32_t string_offset, uint32_t string_end, - uint32_t child_count) - { - // If, thus far, the column's type couldn't be inferred, we infer it to the given type - if (type == json_col_t::Unknown) { type = row_type; } - - // We shouldn't run into this, as we shouldn't be asked to append an "unknown" row 
type - // CUDF_EXPECTS(type != json_col_t::Unknown, "Encountered invalid JSON token sequence"); - - // Fill all the omitted rows with "empty"/null rows (if needed) - null_fill(row_index); - - // Table listing what we intend to use for a given column type and row type combination - // col type | row type => {valid, FAIL, null} - // ----------------------------------------------- - // List | List => valid - // List | Struct => FAIL - // List | String => null - // Struct | List => FAIL - // Struct | Struct => valid - // Struct | String => null - // String | List => null - // String | Struct => null - // String | String => valid - bool const is_valid = (type == row_type); - if (static_cast(validity.size()) < word_index(current_offset)) - validity.push_back({}); - set_bit_unsafe(&validity.back(), intra_word_index(current_offset)); - valid_count += (is_valid) ? 1U : 0U; - string_offsets.push_back(string_offset); - string_lengths.push_back(string_end - string_offset); - child_offsets.push_back((child_offsets.size() > 0) ? 
child_offsets.back() + child_count : 0); - current_offset++; - }; + uint32_t child_count); }; /** diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 4c525caa3c8..de814cb5358 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -18,12 +18,14 @@ #include #include -#include +#include +#include #include #include #include #include +#include #include #include #include @@ -31,10 +33,12 @@ #include #include +#include #include #include #include +#include #include #include @@ -130,9 +134,9 @@ std::array, TT_NUM_STATES> const trans // Translation table (i.e., for each transition, what are the symbols that we output) std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const translation_table{ {/* IN_STATE { [ } ] " \ OTHER */ - /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {'x'}, {'x'}, {'x'}}}, - /* TT_STR */ {{{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}, - /* TT_ESC */ {{{'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}, {'x'}}}}}; + /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {}}}, + /* TT_STR */ {{{}, {}, {}, {}, {}, {}, {}}}, + /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {}}}}}; // The DFA's starting state constexpr auto start_state = static_cast(TT_OOS); @@ -915,14 +919,107 @@ struct JSONToStackOp { } }; +void json_column::null_fill(row_offset_t up_to_row_offset) +{ + // Fill all the rows up to up_to_row_offset with "empty"/null rows + validity.resize(word_index(up_to_row_offset) + 1); + std::fill_n(std::back_inserter(string_offsets), + up_to_row_offset - string_offsets.size(), + (string_offsets.size() > 0) ? string_offsets.back() : 0); + std::fill_n(std::back_inserter(string_lengths), up_to_row_offset - string_lengths.size(), 0); + std::fill_n(std::back_inserter(child_offsets), + up_to_row_offset + 1 - child_offsets.size(), + (child_offsets.size() > 0) ? 
child_offsets.back() : 0); + current_offset = up_to_row_offset; +} + +void json_column::level_child_cols_recursively(row_offset_t min_row_count) +{ + // Fill this columns with nulls up to the given row count + null_fill(min_row_count); + + // If this is a struct column, we need to level all its child columns + if (type == json_col_t::StructColumn) { + for (auto it = std::begin(child_columns); it != std::end(child_columns); it++) { + it->second.level_child_cols_recursively(min_row_count); + } + } + // If this is a list column, we need to make sure that its child column levels its children + else if (type == json_col_t::ListColumn) { + auto it = std::begin(child_columns); + // Make that child column fill its child columns up to its own row count + if (it != std::end(child_columns)) { + it->second.level_child_cols_recursively(it->second.current_offset); + } + } +}; + +void json_column::append_row(uint32_t row_index, + json_col_t row_type, + uint32_t string_offset, + uint32_t string_end, + uint32_t child_count) +{ + // If, thus far, the column's type couldn't be inferred, we infer it to the given type + if (type == json_col_t::Unknown) { + type = row_type; + } + // If, at some point within a column, we encounter a nested type (list or struct), + // we change that column's type to that respective nested type and invalidate all previous rows + else if (type == json_col_t::StringColumn && + (row_type == json_col_t::ListColumn || row_type == json_col_t::StructColumn)) { + // Change the column type + type = row_type; + + // Invalidate all previous entries, as they were _not_ of the nested type to which we just + // converted + std::fill_n(validity.begin(), validity.size(), 0); + valid_count = 0U; + } + // If this is a nested column but we're trying to insert either (a) a list node into a struct + // column or (b) a struct node into a list column, we fail + CUDF_EXPECTS(not((type == json_col_t::ListColumn and row_type == json_col_t::StructColumn) or + (type == 
json_col_t::StructColumn and row_type == json_col_t::ListColumn)), + "A mix of lists and structs within the same column is not supported"); + + // We shouldn't run into this, as we shouldn't be asked to append an "unknown" row type + CUDF_EXPECTS(type != json_col_t::Unknown, "Encountered invalid JSON token sequence"); + + // Fill all the omitted rows with "empty"/null rows (if needed) + null_fill(row_index); + + // Table listing what we intend to use for a given column type and row type combination + // col type | row type => {valid, FAIL, null} + // ----------------------------------------------- + // List | List => valid + // List | Struct => FAIL + // List | String => null + // Struct | List => FAIL + // Struct | Struct => valid + // Struct | String => null + // String | List => valid (we switch col type to list, null'ing all previous rows) + // String | Struct => valid (we switch col type to list, null'ing all previous rows) + // String | String => valid + bool const is_valid = (type == row_type); + if (static_cast(validity.size()) < word_index(current_offset)) validity.push_back({}); + if (is_valid) { set_bit_unsafe(&validity.back(), intra_word_index(current_offset)); } + valid_count += (is_valid) ? 1U : 0U; + string_offsets.push_back(string_offset); + string_lengths.push_back(string_end - string_offset); + child_offsets.push_back((child_offsets.size() > 0) ? 
child_offsets.back() + child_count : 0); + current_offset++; +}; + namespace detail { void get_stack_context(device_span json_in, SymbolT* d_top_of_stack, rmm::cuda_stream_view stream) { + // Range of encapsulating function that comprises: + // -> DFA simulation for filtering out brackets and braces inside of quotes + // -> Logical stack to infer the stack context CUDF_FUNC_RANGE(); - constexpr std::size_t single_item = 1; // Symbol representing the JSON-root (i.e., we're at nesting level '0') constexpr StackSymbolT root_symbol = '_'; @@ -930,7 +1027,7 @@ void get_stack_context(device_span json_in, constexpr StackSymbolT read_symbol = 'x'; // Number of stack operations in the input (i.e., number of '{', '}', '[', ']' outside of quotes) - hostdevice_vector num_stack_ops(single_item, stream); + rmm::device_scalar d_num_stack_ops(stream); // Sequence of stack symbols and their position in the original input (sparse representation) rmm::device_uvector stack_ops{json_in.size(), stream}; @@ -953,14 +1050,17 @@ void get_stack_context(device_span json_in, static_cast(json_in.size()), stack_ops.data(), stack_op_indices.data(), - num_stack_ops.device_ptr(), + d_num_stack_ops.data(), to_stack_op::start_state, stream); + // Copy back to actual number of stack operations + auto const num_stack_ops = d_num_stack_ops.value(stream); + // stack operations with indices are converted to top of the stack for each character in the input fst::sparse_stack_op_to_top_of_stack( stack_ops.data(), - device_span{stack_op_indices.data(), stack_op_indices.size()}, + device_span{stack_op_indices.data(), num_stack_ops}, JSONToStackOp{}, d_top_of_stack, root_symbol, @@ -975,7 +1075,9 @@ std::pair, rmm::device_uvector> ge rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + // Range of encapsulating function that parses to internal columnar data representation CUDF_FUNC_RANGE(); + rmm::device_uvector tokens{json_in.size(), stream, mr}; rmm::device_uvector 
tokens_indices{json_in.size(), stream, mr}; rmm::device_scalar num_written_tokens{stream, mr}; @@ -1039,6 +1141,8 @@ std::pair, rmm::device_uvector> ge * @param[in] input The JSON input in host memory * @param[in] d_input The JSON input in device memory * @param[in] options Parsing options specifying the parsing behaviour + * @param[in] include_quote_char Whether to include the original quote chars around string values, + * allowing to distinguish string values from numeric and literal values * @param[in] stream The CUDA stream to which kernels are dispatched * @param[in] mr Optional, resource with which to allocate * @return The columnar representation of the data from the given JSON input @@ -1048,13 +1152,15 @@ void make_json_column(json_column& root_column, host_span input, device_span d_input, cudf::io::json_reader_options const& options, + bool include_quote_char, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { + // Range of encapsulating function that parses to internal columnar data representation CUDF_FUNC_RANGE(); + // Default name for a list's child column std::string const list_child_name = "element"; - constexpr bool include_quote_char = false; // TODO if merge conflict with PR #11574, make it true // Parse the JSON and get the token stream const auto [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr); @@ -1090,14 +1196,17 @@ void make_json_column(json_column& root_column, }; }; - // Includes quote char for end-of-string token or Skips the quote char for beginning-of-field-name + // Depending on whether we want to include the quotes of strings or not, respectively, we: + // (a) strip off the beginning quote included in StringBegin and FieldNameBegin or + // (b) include of the end quote excluded from in StringEnd and strip off the beginning quote + // included FieldNameBegin auto get_token_index = [include_quote_char](PdaTokenT const token, SymbolOffsetT 
const token_index) { constexpr SymbolOffsetT quote_char_size = 1; switch (token) { - // Strip off or include quote char for StringBegin + // Optionally strip off quote char included for StringBegin case token_t::StringBegin: return token_index + (include_quote_char ? 0 : quote_char_size); - // Strip off or Include trailing quote char for string values for StringEnd + // Optionally include trailing quote char for string values excluded for StringEnd case token_t::StringEnd: return token_index + (include_quote_char ? quote_char_size : 0); // Strip off quote char included for FieldNameBegin case token_t::FieldNameBegin: return token_index + quote_char_size; @@ -1187,6 +1296,7 @@ void make_json_column(json_column& root_column, if (current_data_path.top().column->child_columns.size() == 0) { current_data_path.top().column->child_columns.emplace(std::string{list_child_name}, json_column{json_col_t::Unknown}); + current_data_path.top().column->column_order.push_back(list_child_name); } current_data_path.top().current_selected_col = ¤t_data_path.top().column->child_columns.begin()->second; @@ -1226,6 +1336,7 @@ void make_json_column(json_column& root_column, // The field name's column does not exist yet, so we have to append the child column to the // struct column + struct_col->column_order.push_back(field_name); return &struct_col->child_columns.emplace(field_name, json_column{}).first->second; }; @@ -1419,17 +1530,36 @@ void make_json_column(json_column& root_column, root_column.level_child_cols_recursively(root_column.current_offset); } +/** + * @brief Retrieves the parse_options to be used for type inference and type casting + * + * @param options The reader options to influence the relevant type inference and type casting + * options + */ +auto parsing_options(cudf::io::json_reader_options const& options) +{ + auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; + + auto const stream = cudf::default_stream_value; + parse_opts.keepquotes = 
options.is_enabled_keep_quotes(); + parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); + parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); + parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); + return parse_opts; +} + std::pair, std::vector> json_column_to_cudf_column( json_column const& json_col, device_span d_input, + cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + // Range of orchestrating/encapsulating function CUDF_FUNC_RANGE(); + auto make_validity = [stream, mr](json_column const& json_col) -> std::pair { - if (json_col.current_offset == json_col.valid_count) { return {rmm::device_buffer{}, 0}; } - return {rmm::device_buffer{json_col.validity.data(), bitmask_allocation_size_bytes(json_col.current_offset), stream, @@ -1439,29 +1569,58 @@ std::pair, std::vector> json_column_to switch (json_col.type) { case json_col_t::StringColumn: { - // move string_offsets to GPU and transform to string column - auto const col_size = json_col.string_offsets.size(); - using char_length_pair_t = thrust::pair; + auto const col_size = json_col.string_offsets.size(); CUDF_EXPECTS(json_col.string_offsets.size() == json_col.string_lengths.size(), "string offset, string length mismatch"); - rmm::device_uvector d_string_data(col_size, stream); + + // Move string_offsets and string_lengths to GPU rmm::device_uvector d_string_offsets = cudf::detail::make_device_uvector_async(json_col.string_offsets, stream); rmm::device_uvector d_string_lengths = cudf::detail::make_device_uvector_async(json_col.string_lengths, stream); + + // Prepare iterator that returns (string_offset, string_length)-tuples auto offset_length_it = thrust::make_zip_iterator(d_string_offsets.begin(), d_string_lengths.begin()); - thrust::transform(rmm::exec_policy(stream), - offset_length_it, - offset_length_it + col_size, - d_string_data.data(), - [data = 
d_input.data()] __device__(auto ip) { - return char_length_pair_t{data + thrust::get<0>(ip), thrust::get<1>(ip)}; - }); - auto str_col_ptr = make_strings_column(d_string_data, stream, mr); - auto [result_bitmask, null_count] = make_validity(json_col); - str_col_ptr->set_null_mask(result_bitmask, null_count); - return {std::move(str_col_ptr), {{"offsets"}, {"chars"}}}; + + // Prepare iterator that returns (string_offset, string_length)-pairs needed by inference + auto string_ranges_it = + thrust::make_transform_iterator(offset_length_it, [] __device__(auto ip) { + return thrust::pair{ + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; + }); + + // Prepare iterator that returns (string_ptr, string_length)-pairs needed by type conversion + auto string_spans_it = thrust::make_transform_iterator( + offset_length_it, [data = d_input.data()] __device__(auto ip) { + return thrust::pair{ + data + thrust::get<0>(ip), static_cast(thrust::get<1>(ip))}; + }); + + // Infer column type + auto target_type = cudf::io::detail::infer_data_type( + parsing_options(options).json_view(), d_input, string_ranges_it, col_size, stream); + + // Convert strings to the inferred data type + auto col = experimental::detail::parse_data(string_spans_it, + col_size, + target_type, + make_validity(json_col).first, + parsing_options(options).view(), + stream, + mr); + + // Reset nullable if we do not have nulls + if (col->null_count() == 0) { col->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); } + + // For string columns return ["offsets", "char"] schema + if (target_type.id() == type_id::STRING) { + return {std::move(col), {{"offsets"}, {"chars"}}}; + } + // Non-string leaf-columns (e.g., numeric) do not have child columns in the schema + else { + return {std::move(col), {}}; + } break; } case json_col_t::StructColumn: { @@ -1469,10 +1628,12 @@ std::pair, std::vector> json_column_to std::vector column_names{}; size_type num_rows{json_col.current_offset}; // Create children columns - for 
(auto const& col : json_col.child_columns) { - column_names.emplace_back(col.first); - auto const& child_col = col.second; - auto [child_column, names] = json_column_to_cudf_column(child_col, d_input, stream, mr); + for (auto const& col_name : json_col.column_order) { + auto const& col = json_col.child_columns.find(col_name); + column_names.emplace_back(col->first); + auto const& child_col = col->second; + auto [child_column, names] = + json_column_to_cudf_column(child_col, d_input, options, stream, mr); CUDF_EXPECTS(num_rows == child_column->size(), "All children columns must have the same size"); child_columns.push_back(std::move(child_column)); @@ -1496,8 +1657,8 @@ std::pair, std::vector> json_column_to auto offsets_column = std::make_unique(data_type{type_id::INT32}, num_rows, d_offsets.release()); // Create children column - auto [child_column, names] = - json_column_to_cudf_column(json_col.child_columns.begin()->second, d_input, stream, mr); + auto [child_column, names] = json_column_to_cudf_column( + json_col.child_columns.begin()->second, d_input, options, stream, mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); return {make_lists_column(num_rows - 1, @@ -1521,7 +1682,9 @@ table_with_metadata parse_nested_json(host_span input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + // Range of orchestrating/encapsulating function CUDF_FUNC_RANGE(); + auto const new_line_delimited_json = options.is_enabled_lines(); // Allocate device memory for the JSON input & copy over to device @@ -1536,6 +1699,10 @@ table_with_metadata parse_nested_json(host_span input, constexpr uint32_t token_end_offset_zero = 0; constexpr uint32_t node_init_child_count_zero = 0; + // Whether the tokenizer stage should keep quote characters for string values + // If the tokenizer keeps the quote characters, they may be stripped during type casting + constexpr bool include_quote_chars = true; + // We initialize the 
very root node and root column, which represent the JSON document being // parsed. That root node is a list node and that root column is a list column. The column has the // root node as its only row. The values parsed from the JSON input will be treated as follows: @@ -1549,7 +1716,8 @@ table_with_metadata parse_nested_json(host_span input, // Push the root node onto the stack for the data path data_path.push({&root_column, row_offset_zero, nullptr, node_init_child_count_zero}); - make_json_column(root_column, data_path, input, d_input, options, stream, mr); + make_json_column( + root_column, data_path, input, d_input, options, include_quote_chars, stream, mr); // data_root refers to the root column of the data represented by the given JSON string auto const& data_root = @@ -1570,12 +1738,14 @@ table_with_metadata parse_nested_json(host_span input, std::vector out_column_names; // Iterate over the struct's child columns and convert to cudf column - for (auto const& [col_name, json_col] : root_struct_col.child_columns) { + for (auto const& col_name : root_struct_col.column_order) { + auto const& json_col = root_struct_col.child_columns.find(col_name)->second; // Insert this columns name into the schema out_column_names.emplace_back(col_name); // Get this JSON column's cudf column and schema info - auto [cudf_col, col_name_info] = json_column_to_cudf_column(json_col, d_input, stream, mr); + auto [cudf_col, col_name_info] = + json_column_to_cudf_column(json_col, d_input, options, stream, mr); out_column_names.back().children = std::move(col_name_info); out_columns.emplace_back(std::move(cudf_col)); } diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 232aaa51ef3..7f698774084 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1006,4 +1006,67 @@ TEST_F(JsonReaderTest, ExperimentalLinesNoOmissions) } } +TEST_F(JsonReaderTest, TestColumnOrder) +{ + std::string const json_string = + // Expected order: + // root: b, c, a, d 
+ // a: 2, 0, 1 + {R"({"b":"b0"} + {"c":"c1","a":{"2":null}} + {"d":"d2","a":{"0":"a2.0", "2":"a2.2"}} + {"b":"b3","a":{"1":null, "2":"a3.2"}})"}; + + std::vector const root_col_names{"b", "c", "a", "d"}; + std::vector const a_child_col_names{"2", "0", "1"}; + + // Initialize parsing options (reading json lines) + cudf::io::json_reader_options json_lines_options = + cudf::io::json_reader_options::builder( + cudf::io::source_info{json_string.c_str(), json_string.size()}) + .lines(true) + .experimental(true); + + // Read in data using nested JSON reader + cudf::io::table_with_metadata new_reader_table = cudf::io::read_json(json_lines_options); + + // Verify root column order (assert to avoid OOB access) + ASSERT_EQ(new_reader_table.metadata.schema_info.size(), root_col_names.size()); + + for (std::size_t i = 0; i < root_col_names.size(); i++) { + auto const& root_col_name = root_col_names[i]; + EXPECT_EQ(new_reader_table.metadata.schema_info[i].name, root_col_name); + } + + // Verify nested child column order (assert to avoid OOB access) + ASSERT_EQ(new_reader_table.metadata.schema_info[2].children.size(), a_child_col_names.size()); + for (std::size_t i = 0; i < a_child_col_names.size(); i++) { + auto const& a_child_col_name = a_child_col_names[i]; + EXPECT_EQ(new_reader_table.metadata.schema_info[2].children[i].name, a_child_col_name); + } + + // Verify data of root columns + ASSERT_EQ(root_col_names.size(), new_reader_table.tbl->num_columns()); + column_wrapper root_col_data_b{{"b0", "", "", "b3"}, + {true, false, false, true}}; + column_wrapper root_col_data_c{{"", "c1", "", ""}, + {false, true, false, false}}; + column_wrapper root_col_data_d{{"", "", "d2", ""}, + {false, false, true, false}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(root_col_data_b, new_reader_table.tbl->get_column(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(root_col_data_c, new_reader_table.tbl->get_column(1)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(root_col_data_d, new_reader_table.tbl->get_column(3)); + + //
Verify data of child columns of column 'a' + auto const col_a = new_reader_table.tbl->get_column(2); + ASSERT_EQ(a_child_col_names.size(), col_a.num_children()); + column_wrapper col_a2{{"", "", "a2.2", "a3.2"}, {false, false, true, true}}; + column_wrapper col_a0{{"", "", "a2.0", ""}, {false, false, true, false}}; + // col a.1 is inferred as all-null + int8_wrapper col_a1{{0, 0, 0, 0}, {false, false, false, false}}; + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_a2, col_a.child(0)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_a0, col_a.child(1)); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(col_a1, col_a.child(2)); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index 55364ca7e9d..7f64f1191cf 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -470,8 +470,8 @@ TEST_F(JsonTest, ExtractColumn) auto const second_column_index = 1; EXPECT_EQ(cudf_table.tbl->num_columns(), expected_col_count); - auto expected_col1 = cudf::test::strings_column_wrapper({"0.0", "0.1", "0.2"}); - auto expected_col2 = cudf::test::strings_column_wrapper({"1.0", "1.1", "1.2"}); + auto expected_col1 = cudf::test::fixed_width_column_wrapper({0.0, 0.1, 0.2}); + auto expected_col2 = cudf::test::fixed_width_column_wrapper({1.0, 1.1, 1.2}); cudf::column_view parsed_col1 = cudf_table.tbl->get_column(first_column_index); CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col1, parsed_col1); cudf::column_view parsed_col2 = cudf_table.tbl->get_column(second_column_index); @@ -519,110 +519,64 @@ TEST_F(JsonTest, UTF_JSON) CUDF_EXPECT_NO_THROW(cuio_json::detail::parse_nested_json(utf_pass, default_options, stream)); } -TEST_F(JsonTest, DISABLED_FromParquet) +TEST_F(JsonTest, ExtractColumnWithQuotes) { using cuio_json::SymbolT; - std::string const input = - R"([{"0":{},"1":[],"2":{}},{"1":[[""],[]],"2":{"2":""}},{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}])"; - // Prepare cuda stream for data transfers & kernels constexpr auto stream = 
cudf::default_stream_value; // Default parsing options - cudf::io::json_reader_options default_options{}; + cudf::io::json_reader_options options{}; + options.enable_keep_quotes(true); - // Binary parquet data containing the same data as the data represented by the JSON string. - // We could add a dataset to include this file, but we don't want tests in cudf to have data. - const unsigned char parquet_data[] = { - 0x50, 0x41, 0x52, 0x31, 0x15, 0x00, 0x15, 0x18, 0x15, 0x18, 0x2C, 0x15, 0x06, 0x15, 0x00, 0x15, - 0x06, 0x15, 0x06, 0x00, 0x00, 0x03, 0x00, 0x00, 0x00, 0x03, 0x21, 0x00, 0x01, 0x00, 0x00, 0x00, - 0x31, 0x15, 0x00, 0x15, 0x24, 0x15, 0x20, 0x2C, 0x15, 0x08, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, - 0x00, 0x00, 0x12, 0x18, 0x03, 0x00, 0x00, 0x00, 0x03, 0x10, 0x00, 0x05, 0x07, 0x04, 0x2D, 0x00, - 0x01, 0x01, 0x15, 0x00, 0x15, 0x22, 0x15, 0x22, 0x2C, 0x15, 0x06, 0x15, 0x00, 0x15, 0x06, 0x15, - 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x04, 0x07, 0x00, 0x00, 0x00, 0x57, 0x26, 0x52, - 0x52, 0x3D, 0x2B, 0x49, 0x15, 0x00, 0x15, 0x14, 0x15, 0x14, 0x2C, 0x15, 0x06, 0x15, 0x00, 0x15, - 0x06, 0x15, 0x06, 0x00, 0x00, 0x02, 0x00, 0x00, 0x00, 0x03, 0x04, 0x00, 0x00, 0x00, 0x00, 0x15, - 0x00, 0x15, 0x14, 0x15, 0x14, 0x2C, 0x15, 0x06, 0x15, 0x00, 0x15, 0x06, 0x15, 0x06, 0x00, 0x00, - 0x02, 0x00, 0x00, 0x00, 0x03, 0x02, 0x00, 0x00, 0x00, 0x00, 0x15, 0x02, 0x19, 0xCC, 0x48, 0x06, - 0x73, 0x63, 0x68, 0x65, 0x6D, 0x61, 0x15, 0x06, 0x00, 0x35, 0x02, 0x18, 0x01, 0x30, 0x15, 0x02, - 0x00, 0x15, 0x0C, 0x25, 0x02, 0x18, 0x01, 0x61, 0x25, 0x00, 0x00, 0x35, 0x02, 0x18, 0x01, 0x31, - 0x15, 0x02, 0x15, 0x06, 0x00, 0x35, 0x04, 0x18, 0x04, 0x6C, 0x69, 0x73, 0x74, 0x15, 0x02, 0x00, - 0x35, 0x00, 0x18, 0x07, 0x65, 0x6C, 0x65, 0x6D, 0x65, 0x6E, 0x74, 0x15, 0x02, 0x15, 0x06, 0x00, - 0x35, 0x04, 0x18, 0x04, 0x6C, 0x69, 0x73, 0x74, 0x15, 0x02, 0x00, 0x15, 0x0C, 0x25, 0x00, 0x18, - 0x07, 0x65, 0x6C, 0x65, 0x6D, 0x65, 0x6E, 0x74, 0x25, 0x00, 0x00, 0x35, 0x00, 0x18, 0x01, 0x32, - 0x15, 0x06, 
0x00, 0x15, 0x0C, 0x25, 0x02, 0x18, 0x01, 0x30, 0x25, 0x00, 0x00, 0x15, 0x0C, 0x25, - 0x02, 0x18, 0x01, 0x31, 0x25, 0x00, 0x00, 0x15, 0x0C, 0x25, 0x02, 0x18, 0x01, 0x32, 0x25, 0x00, - 0x00, 0x16, 0x06, 0x19, 0x1C, 0x19, 0x5C, 0x26, 0x00, 0x1C, 0x15, 0x0C, 0x19, 0x25, 0x00, 0x06, - 0x19, 0x28, 0x01, 0x30, 0x01, 0x61, 0x15, 0x00, 0x16, 0x06, 0x16, 0x3A, 0x16, 0x3A, 0x26, 0x08, - 0x3C, 0x36, 0x04, 0x28, 0x01, 0x31, 0x18, 0x01, 0x31, 0x00, 0x00, 0x00, 0x26, 0x00, 0x1C, 0x15, - 0x0C, 0x19, 0x25, 0x00, 0x06, 0x19, 0x58, 0x01, 0x31, 0x04, 0x6C, 0x69, 0x73, 0x74, 0x07, 0x65, - 0x6C, 0x65, 0x6D, 0x65, 0x6E, 0x74, 0x04, 0x6C, 0x69, 0x73, 0x74, 0x07, 0x65, 0x6C, 0x65, 0x6D, - 0x65, 0x6E, 0x74, 0x15, 0x02, 0x16, 0x08, 0x16, 0x46, 0x16, 0x42, 0x26, 0x42, 0x3C, 0x36, 0x00, - 0x28, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x26, 0x00, 0x1C, 0x15, 0x0C, 0x19, 0x25, 0x00, 0x06, - 0x19, 0x28, 0x01, 0x32, 0x01, 0x30, 0x15, 0x00, 0x16, 0x06, 0x16, 0x44, 0x16, 0x44, 0x26, 0x84, - 0x01, 0x3C, 0x36, 0x04, 0x28, 0x07, 0x57, 0x26, 0x52, 0x52, 0x3D, 0x2B, 0x49, 0x18, 0x07, 0x57, - 0x26, 0x52, 0x52, 0x3D, 0x2B, 0x49, 0x00, 0x00, 0x00, 0x26, 0x00, 0x1C, 0x15, 0x0C, 0x19, 0x25, - 0x00, 0x06, 0x19, 0x28, 0x01, 0x32, 0x01, 0x31, 0x15, 0x00, 0x16, 0x06, 0x16, 0x36, 0x16, 0x36, - 0x26, 0xC8, 0x01, 0x3C, 0x36, 0x04, 0x28, 0x00, 0x18, 0x00, 0x00, 0x00, 0x00, 0x26, 0x00, 0x1C, - 0x15, 0x0C, 0x19, 0x25, 0x00, 0x06, 0x19, 0x28, 0x01, 0x32, 0x01, 0x32, 0x15, 0x00, 0x16, 0x06, - 0x16, 0x36, 0x16, 0x36, 0x26, 0xFE, 0x01, 0x3C, 0x36, 0x04, 0x28, 0x00, 0x18, 0x00, 0x00, 0x00, - 0x00, 0x16, 0xAC, 0x02, 0x16, 0x06, 0x00, 0x19, 0x1C, 0x18, 0x06, 0x70, 0x61, 0x6E, 0x64, 0x61, - 0x73, 0x18, 0xFE, 0x04, 0x7B, 0x22, 0x69, 0x6E, 0x64, 0x65, 0x78, 0x5F, 0x63, 0x6F, 0x6C, 0x75, - 0x6D, 0x6E, 0x73, 0x22, 0x3A, 0x20, 0x5B, 0x7B, 0x22, 0x6B, 0x69, 0x6E, 0x64, 0x22, 0x3A, 0x20, - 0x22, 0x72, 0x61, 0x6E, 0x67, 0x65, 0x22, 0x2C, 0x20, 0x22, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, - 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x2C, 0x20, 0x22, 
0x73, 0x74, 0x61, 0x72, 0x74, 0x22, 0x3A, 0x20, - 0x30, 0x2C, 0x20, 0x22, 0x73, 0x74, 0x6F, 0x70, 0x22, 0x3A, 0x20, 0x33, 0x2C, 0x20, 0x22, 0x73, - 0x74, 0x65, 0x70, 0x22, 0x3A, 0x20, 0x31, 0x7D, 0x5D, 0x2C, 0x20, 0x22, 0x63, 0x6F, 0x6C, 0x75, - 0x6D, 0x6E, 0x5F, 0x69, 0x6E, 0x64, 0x65, 0x78, 0x65, 0x73, 0x22, 0x3A, 0x20, 0x5B, 0x7B, 0x22, - 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x2C, 0x20, 0x22, 0x66, 0x69, - 0x65, 0x6C, 0x64, 0x5F, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x2C, - 0x20, 0x22, 0x70, 0x61, 0x6E, 0x64, 0x61, 0x73, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, - 0x22, 0x75, 0x6E, 0x69, 0x63, 0x6F, 0x64, 0x65, 0x22, 0x2C, 0x20, 0x22, 0x6E, 0x75, 0x6D, 0x70, - 0x79, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, 0x65, 0x63, 0x74, - 0x22, 0x2C, 0x20, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, 0x3A, 0x20, 0x7B, - 0x22, 0x65, 0x6E, 0x63, 0x6F, 0x64, 0x69, 0x6E, 0x67, 0x22, 0x3A, 0x20, 0x22, 0x55, 0x54, 0x46, - 0x2D, 0x38, 0x22, 0x7D, 0x7D, 0x5D, 0x2C, 0x20, 0x22, 0x63, 0x6F, 0x6C, 0x75, 0x6D, 0x6E, 0x73, - 0x22, 0x3A, 0x20, 0x5B, 0x7B, 0x22, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x30, 0x22, - 0x2C, 0x20, 0x22, 0x66, 0x69, 0x65, 0x6C, 0x64, 0x5F, 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, - 0x22, 0x30, 0x22, 0x2C, 0x20, 0x22, 0x70, 0x61, 0x6E, 0x64, 0x61, 0x73, 0x5F, 0x74, 0x79, 0x70, - 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, 0x65, 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6E, - 0x75, 0x6D, 0x70, 0x79, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, - 0x65, 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, - 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x7D, 0x2C, 0x20, 0x7B, 0x22, 0x6E, 0x61, 0x6D, 0x65, 0x22, - 0x3A, 0x20, 0x22, 0x31, 0x22, 0x2C, 0x20, 0x22, 0x66, 0x69, 0x65, 0x6C, 0x64, 0x5F, 0x6E, 0x61, - 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x31, 0x22, 0x2C, 0x20, 0x22, 0x70, 0x61, 0x6E, 
0x64, 0x61, - 0x73, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6C, 0x69, 0x73, 0x74, 0x5B, 0x6C, - 0x69, 0x73, 0x74, 0x5B, 0x75, 0x6E, 0x69, 0x63, 0x6F, 0x64, 0x65, 0x5D, 0x5D, 0x22, 0x2C, 0x20, - 0x22, 0x6E, 0x75, 0x6D, 0x70, 0x79, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, - 0x62, 0x6A, 0x65, 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6D, 0x65, 0x74, 0x61, 0x64, 0x61, 0x74, - 0x61, 0x22, 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x7D, 0x2C, 0x20, 0x7B, 0x22, 0x6E, 0x61, 0x6D, - 0x65, 0x22, 0x3A, 0x20, 0x22, 0x32, 0x22, 0x2C, 0x20, 0x22, 0x66, 0x69, 0x65, 0x6C, 0x64, 0x5F, - 0x6E, 0x61, 0x6D, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x32, 0x22, 0x2C, 0x20, 0x22, 0x70, 0x61, 0x6E, - 0x64, 0x61, 0x73, 0x5F, 0x74, 0x79, 0x70, 0x65, 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, 0x65, - 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6E, 0x75, 0x6D, 0x70, 0x79, 0x5F, 0x74, 0x79, 0x70, 0x65, - 0x22, 0x3A, 0x20, 0x22, 0x6F, 0x62, 0x6A, 0x65, 0x63, 0x74, 0x22, 0x2C, 0x20, 0x22, 0x6D, 0x65, - 0x74, 0x61, 0x64, 0x61, 0x74, 0x61, 0x22, 0x3A, 0x20, 0x6E, 0x75, 0x6C, 0x6C, 0x7D, 0x5D, 0x2C, - 0x20, 0x22, 0x63, 0x72, 0x65, 0x61, 0x74, 0x6F, 0x72, 0x22, 0x3A, 0x20, 0x7B, 0x22, 0x6C, 0x69, - 0x62, 0x72, 0x61, 0x72, 0x79, 0x22, 0x3A, 0x20, 0x22, 0x70, 0x79, 0x61, 0x72, 0x72, 0x6F, 0x77, - 0x22, 0x2C, 0x20, 0x22, 0x76, 0x65, 0x72, 0x73, 0x69, 0x6F, 0x6E, 0x22, 0x3A, 0x20, 0x22, 0x38, - 0x2E, 0x30, 0x2E, 0x31, 0x22, 0x7D, 0x2C, 0x20, 0x22, 0x70, 0x61, 0x6E, 0x64, 0x61, 0x73, 0x5F, - 0x76, 0x65, 0x72, 0x73, 0x69, 0x6F, 0x6E, 0x22, 0x3A, 0x20, 0x22, 0x31, 0x2E, 0x34, 0x2E, 0x33, - 0x22, 0x7D, 0x00, 0x29, 0x5C, 0x1C, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x1C, 0x00, 0x00, 0x1C, 0x00, - 0x00, 0x1C, 0x00, 0x00, 0x00, 0x0B, 0x04, 0x00, 0x00, 0x50, 0x41, 0x52, 0x31}; - - // Read in the data via parquet reader - cudf::io::parquet_reader_options read_opts = cudf::io::parquet_reader_options::builder( - cudf::io::source_info{reinterpret_cast(parquet_data), sizeof(parquet_data)}); - auto result = 
cudf::io::read_parquet(read_opts); - - // Read in the data via the JSON parser + std::string const input = R"( [{"a":"0.0", "b":1.0}, {"b":1.1}, {"b":2.1, "a":"2.0"}] )"; + // Get the JSON's tree representation auto const cudf_table = cuio_json::detail::parse_nested_json( - cudf::host_span{input.data(), input.size()}, default_options, stream); + cudf::host_span{input.data(), input.size()}, options, stream); + + auto constexpr expected_col_count = 2; + EXPECT_EQ(cudf_table.tbl->num_columns(), expected_col_count); + + auto expected_col1 = + cudf::test::strings_column_wrapper({R"("0.0")", R"()", R"("2.0")"}, {true, false, true}); + auto expected_col2 = cudf::test::fixed_width_column_wrapper({1.0, 1.1, 2.1}); + cudf::column_view parsed_col1 = cudf_table.tbl->get_column(0); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col1, parsed_col1); + cudf::column_view parsed_col2 = cudf_table.tbl->get_column(1); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col2, parsed_col2); +} + +TEST_F(JsonTest, ExpectFailMixStructAndList) +{ + using cuio_json::SymbolT; + + // Prepare cuda stream for data transfers & kernels + constexpr auto stream = cudf::default_stream_value; - // Verify that the data read via parquet matches the data read via JSON - CUDF_TEST_EXPECT_TABLES_EQUAL(cudf_table.tbl->view(), result.tbl->view()); + // Default parsing options + cudf::io::json_reader_options options{}; + options.enable_keep_quotes(true); + + std::vector const inputs_fail{ + R"( [{"a":[123], "b":1.0}, {"b":1.1}, {"b":2.1, "a":{"0":123}}] )", + R"( [{"a":{"0":"foo"}, "b":1.0}, {"b":1.1}, {"b":2.1, "a":[123]}] )", + R"( [{"a":{"0":null}, "b":1.0}, {"b":1.1}, {"b":2.1, "a":[123]}] )"}; + + std::vector const inputs_succeed{ + R"( [{"a":[123, {"0": 123}], "b":1.0}, {"b":1.1}, {"b":2.1}] )", + R"( [{"a":[123, "123"], "b":1.0}, {"b":1.1}, {"b":2.1}] )"}; + + for (auto const& input : inputs_fail) { + CUDF_EXPECT_THROW_MESSAGE( + auto const cudf_table = cuio_json::detail::parse_nested_json( + 
cudf::host_span{input.data(), input.size()}, options, stream), + "A mix of lists and structs within the same column is not supported"); + } - // Verify that the schema read via parquet matches the schema read via JSON - cudf::test::expect_metadata_equal(cudf_table.metadata, result.metadata); + for (auto const& input : inputs_succeed) { + CUDF_EXPECT_NO_THROW( + auto const cudf_table = cuio_json::detail::parse_nested_json( + cudf::host_span{input.data(), input.size()}, options, stream)); + } } diff --git a/python/cudf/cudf/tests/test_json.py b/python/cudf/cudf/tests/test_json.py index f3d9180d44d..f6ca4691669 100644 --- a/python/cudf/cudf/tests/test_json.py +++ b/python/cudf/cudf/tests/test_json.py @@ -615,6 +615,48 @@ def test_json_nested_lines(data): ) bytes.seek(0) pdf = pd.read_json(bytes, orient="records", lines=True) - # In the second test-case: - # Pandas omits "f1" in first row, so we have to enforce a common schema - assert df.to_arrow().equals(pa.Table.from_pandas(pdf)) + # In the second test-case we need to take a detour via pyarrow: + # pandas omits "f1" in the first row, so we have to enforce a common schema + # in which pandas also has the "f1" member (as null). + # Also, pyarrow may order the children of a nested column differently, + # even though the key-value pairs are correct.
+ pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) + assert df.to_arrow().equals(pa_table_pdf) + + +def test_json_nested_data(): + json_str = ( + '[{"0":{},"2":{}},{"1":[[""],[]],"2":{"2":""}},' + '{"0":{"a":"1"},"2":{"0":"W&RR=+I","1":""}}]' + ) + df = cudf.read_json( + StringIO(json_str), engine="cudf_experimental", orient="records" + ) + pdf = pd.read_json(StringIO(json_str), orient="records") + pdf.columns = pdf.columns.astype("str") + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) + assert df.to_arrow().equals(pa_table_pdf) + + +def test_json_types_data(): + # 0:<0:string,1:float> + # 1:list<int> + # 2:<0:bool> + json_str = ( + '[{"0":null,"2":{}},' + '{"1":[123],"0":{"0":"foo","1":123.4},"2":{"0":false}},' + '{"0":{},"1":[],"2":{"0":null}}]' + ) + df = cudf.read_json( + StringIO(json_str), engine="cudf_experimental", orient="records" + ) + pdf = pd.read_json(StringIO(json_str), orient="records") + pdf.columns = pdf.columns.astype("str") + pa_table_pdf = pa.Table.from_pandas( + pdf, schema=df.to_arrow().schema, safe=False + ) + assert df.to_arrow().equals(pa_table_pdf)