From 2436e0bbcb44123d116f8a5b8a6169ffbb125589 Mon Sep 17 00:00:00 2001 From: Elias Stehle <3958403+elstehle@users.noreply.github.com> Date: Fri, 14 Jul 2023 18:30:50 +0200 Subject: [PATCH] [FEA] Adds option to recover from invalid JSON lines in JSON tokenizer (#13344) This PR adds the option to recover from invalid JSON lines to the JSON tokenizer. **New option and behaviour:** - We add the option `enable_recover_from_error` to `json_reader_options`. When this option is enabled for a JSON lines input, the reader will recover from a parsing error encountered on an invalid JSON line and continue parsing the next line. - When the new option is not enabled, we expect the behaviour of existing functionality to remain untouched. - When recovering from invalid JSON lines is enabled, all newline characters that are not enclosed in quotes (i.e., newline characters outside of `strings` and `field names`) are interpreted as delimiters of a JSON line. We will introduce a new option that reflects this behaviour for JSON lines inputs that should not recover from errors in a future PR. Hence, this PR introduces the `JSON_LINES_STRICT` enum but does not yet hook it up. **Implementation details:** - When recovering from invalid JSON lines is enabled, `get_token_stream()` will delimit each JSON line with a `LineEnd` token to facilitate the identification of tokens that belong to an invalid JSON line. - We extend the logical stack and introduce a new operation, `reset()`. A `reset()` operation resets the logical stack to an empty stack. This is necessary to reset the stack of the pushdown automaton (PDA) after an invalid JSON line to make sure the stack in subsequent lines is not corrupted. - We modify the transition and translation table of the finite-state transducer (FST) that is used to generate the push-down automaton's (PDA) stack context operations to emit such a `reset()` operation, iff `recovery` is enabled. 
- We modify the transition and translation table of the finite-state transducer (FST) that is used to simulate the full PDA to (1) recover after an invalid JSON line and (2) emit the `LineEnd` token, iff `recovery` is enabled. - To clean up JSON lines that contain tokens belonging to an invalid line, a token *post-processing* stage is needed. The *post-processing* will replace sequences of `LineEnd` `token*` `ErrorBegin` with the sequence `StructBegin` `StructEnd` (i.e., effectively a `null` row) for record orient inputs. - This post-processing is implemented by running an FST on the reverse token stream, discarding all tokens between `ErrorBegin` and the next `LineEnd`, emitting `StructBegin` `StructEnd` pairs on the end of such an invalid line. This is an initial PR to address https://github.com/rapidsai/cudf/issues/12532. Authors: - Elias Stehle (https://github.com/elstehle) Approvers: - Vukasin Milovanovic (https://github.com/vuule) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/13344 --- cpp/benchmarks/io/fst.cu | 31 +- cpp/include/cudf/io/detail/tokenize_json.hpp | 2 + cpp/include/cudf/io/json.hpp | 37 + cpp/src/io/fst/agent_dfa.cuh | 53 +- cpp/src/io/fst/logical_stack.cuh | 51 +- cpp/src/io/fst/lookup_tables.cuh | 520 +++++++-- cpp/src/io/json/nested_json.hpp | 31 + cpp/src/io/json/nested_json_gpu.cu | 1082 ++++++++++++------ cpp/tests/io/fst/fst_test.cu | 9 +- cpp/tests/io/json_test.cpp | 46 + cpp/tests/io/json_tree.cpp | 1 + cpp/tests/io/nested_json_test.cpp | 294 ++++- 12 files changed, 1603 insertions(+), 554 deletions(-) diff --git a/cpp/benchmarks/io/fst.cu b/cpp/benchmarks/io/fst.cu index 6b821eb5cae..c0c88517d41 100644 --- a/cpp/benchmarks/io/fst.cu +++ b/cpp/benchmarks/io/fst.cu @@ -67,10 +67,9 @@ auto make_test_json_data(nvbench::state& state) // Type used to represent the atomic symbol type used within the finite-state machine using SymbolT = char; // Type sufficiently large to index symbols within the 
input and output (may be unsigned) -using SymbolOffsetT = uint32_t; -// Helper class to set up transition table, symbol group lookup table, and translation table -using DfaFstT = cudf::io::fst::detail::Dfa; -constexpr std::size_t single_item = 1; +using SymbolOffsetT = uint32_t; +constexpr std::size_t single_item = 1; +constexpr auto max_translation_table_size = TT_NUM_STATES * NUM_SYMBOL_GROUPS; } // namespace @@ -94,7 +93,11 @@ void BM_FST_JSON(nvbench::state& state) cudf::detail::hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { @@ -129,7 +132,11 @@ void BM_FST_JSON_no_outidx(nvbench::state& state) cudf::detail::hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { @@ -162,7 +169,11 @@ void BM_FST_JSON_no_out(nvbench::state& state) cudf::detail::hostdevice_vector output_gpu_size(single_item, stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + 
cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { @@ -196,7 +207,11 @@ void BM_FST_JSON_no_str(nvbench::state& state) cudf::detail::hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { diff --git a/cpp/include/cudf/io/detail/tokenize_json.hpp b/cpp/include/cudf/io/detail/tokenize_json.hpp index 4914f434c98..b2ea29a85c3 100644 --- a/cpp/include/cudf/io/detail/tokenize_json.hpp +++ b/cpp/include/cudf/io/detail/tokenize_json.hpp @@ -110,6 +110,8 @@ enum token_t : PdaTokenT { ValueEnd, /// Beginning-of-error token (on first encounter of a parsing error) ErrorBegin, + /// Delimiting a JSON line for error recovery + LineEnd, /// Total number of tokens NUM_TOKENS }; diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 670409a898a..15dc2a614ad 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -54,6 +54,14 @@ struct schema_element { std::map child_types; }; +/** + * @brief Control the error recovery behavior of the json parser + */ +enum class json_recovery_mode_t { + FAIL, ///< Does not recover from an error when encountering an invalid format + RECOVER_WITH_NULL ///< Recovers from an error, replacing invalid records with null +}; + /** * @brief Input arguments to the `read_json` interface. 
* @@ -105,6 +113,9 @@ class json_reader_options { // Whether to keep the quote characters of string values bool _keep_quotes = false; + // Whether to recover after an invalid JSON line + json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; + /** * @brief Constructor from source info. * @@ -235,6 +246,13 @@ class json_reader_options { */ bool is_enabled_keep_quotes() const { return _keep_quotes; } + /** + * @brief Queries the JSON reader's behavior on invalid JSON lines. + * + * @returns An enum that specifies the JSON reader's behavior on invalid JSON lines. + */ + json_recovery_mode_t recovery_mode() const { return _recovery_mode; } + /** * @brief Set data types for columns to be read. * @@ -305,6 +323,13 @@ class json_reader_options { * of string values */ void enable_keep_quotes(bool val) { _keep_quotes = val; } + + /** + * @brief Specifies the JSON reader's behavior on invalid JSON lines. + * + * @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines. + */ + void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; } }; /** @@ -449,6 +474,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Specifies the JSON reader's behavior on invalid JSON lines. + * + * @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines. + * @return this for chaining + */ + json_reader_options_builder& recovery_mode(json_recovery_mode_t val) + { + options._recovery_mode = val; + return *this; + } + /** * @brief move json_reader_options member once it's built. 
*/ diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 0c813c7917f..52fd039c097 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -83,16 +83,18 @@ class DFASimulationCallbackWrapper { if (!write) out_count = 0; } - template + template __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, StateIndexT const old_state, StateIndexT const new_state, - SymbolIndexT const symbol_id) + SymbolIndexT const symbol_id, + SymbolT const read_symbol) { - uint32_t const count = transducer_table(old_state, symbol_id); + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); if (write) { for (uint32_t out_char = 0; out_char < count; out_char++) { - out_it[out_count + out_char] = transducer_table(old_state, symbol_id, out_char); + out_it[out_count + out_char] = + transducer_table(old_state, symbol_id, out_char, read_symbol); out_idx_it[out_count + out_char] = offset + character_index; } } @@ -127,9 +129,10 @@ class StateVectorTransitionOp { { } - template + template __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, - SymbolIndexT const read_symbol_id) const + SymbolIndexT const& read_symbol_id, + SymbolT const& read_symbol) const { for (int32_t i = 0; i < NUM_INSTANCES; ++i) { state_vector[i] = transition_table(state_vector[i], read_symbol_id); @@ -154,15 +157,16 @@ struct StateTransitionOp { { } - template + template __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, - SymbolIndexT const& read_symbol_id) + SymbolIndexT const& read_symbol_id, + SymbolT const& read_symbol) { // Remember what state we were in before we made the transition StateIndexT previous_state = state; state = transition_table(state, read_symbol_id); - callback_op.ReadSymbol(character_index, previous_state, state, read_symbol_id); + callback_op.ReadSymbol(character_index, previous_state, state, read_symbol_id, read_symbol); } }; @@ 
-230,7 +234,7 @@ struct AgentDFA { for (int32_t i = 0; i < NUM_SYMBOLS; ++i) { if (IS_FULL_BLOCK || threadIdx.x * SYMBOLS_PER_THREAD + i < max_num_chars) { auto matched_id = symbol_matcher(chars[i]); - callback_op.ReadSymbol(i, matched_id); + callback_op.ReadSymbol(i, matched_id, chars[i]); } } } @@ -253,7 +257,8 @@ struct AgentDFA { //--------------------------------------------------------------------- // LOADING FULL BLOCK OF CHARACTERS, NON-ALIASED //--------------------------------------------------------------------- - __device__ __forceinline__ void LoadBlock(CharT const* d_chars, + template + __device__ __forceinline__ void LoadBlock(CharInItT d_chars, OffsetT const block_offset, OffsetT const num_total_symbols, cub::Int2Type /*IS_FULL_BLOCK*/, @@ -261,7 +266,7 @@ struct AgentDFA { { CharT thread_chars[SYMBOLS_PER_THREAD]; - CharT const* d_block_symbols = d_chars + block_offset; + CharInItT d_block_symbols = d_chars + block_offset; cub::LoadDirectStriped(threadIdx.x, d_block_symbols, thread_chars); #pragma unroll @@ -273,7 +278,8 @@ struct AgentDFA { //--------------------------------------------------------------------- // LOADING PARTIAL BLOCK OF CHARACTERS, NON-ALIASED //--------------------------------------------------------------------- - __device__ __forceinline__ void LoadBlock(CharT const* d_chars, + template + __device__ __forceinline__ void LoadBlock(CharInItT d_chars, OffsetT const block_offset, OffsetT const num_total_symbols, cub::Int2Type /*IS_FULL_BLOCK*/, @@ -286,7 +292,7 @@ struct AgentDFA { // Last unit to be loaded is IDIV_CEIL(#SYM, SYMBOLS_PER_UNIT) OffsetT num_total_chars = num_total_symbols - block_offset; - CharT const* d_block_symbols = d_chars + block_offset; + CharInItT d_block_symbols = d_chars + block_offset; cub::LoadDirectStriped( threadIdx.x, d_block_symbols, thread_chars, num_total_chars); @@ -372,11 +378,26 @@ struct AgentDFA { } } + template + __device__ __forceinline__ void LoadBlock(CharInItT d_chars, + OffsetT const 
block_offset, + OffsetT const num_total_symbols) + { + // Check if we are loading a full tile of data + if (block_offset + SYMBOLS_PER_UINT_BLOCK < num_total_symbols) { + LoadBlock( + d_chars, block_offset, num_total_symbols, cub::Int2Type(), cub::Int2Type<1>()); + } else { + LoadBlock( + d_chars, block_offset, num_total_symbols, cub::Int2Type(), cub::Int2Type<1>()); + } + } + template __device__ __forceinline__ void GetThreadStateTransitionVector( SymbolMatcherT const& symbol_matcher, TransitionTableT const& transition_table, - CharT const* d_chars, + SymbolItT d_chars, OffsetT const block_offset, OffsetT const num_total_symbols, std::array& state_vector) @@ -416,7 +437,7 @@ struct AgentDFA { __device__ __forceinline__ void GetThreadStateTransitions( SymbolMatcherT const& symbol_matcher, TransitionTableT const& transition_table, - CharT const* d_chars, + SymbolItT d_chars, OffsetT const block_offset, OffsetT const num_total_symbols, StateIndexT& state, diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index 27ce6403ee8..a5d32cba125 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -42,9 +42,10 @@ namespace cudf::io::fst { * @brief Describes the kind of stack operation. 
*/ enum class stack_op_type : int8_t { - READ = 0, ///< Operation reading what is currently on top of the stack - PUSH = 1, ///< Operation pushing a new item on top of the stack - POP = 2 ///< Operation popping the item currently on top of the stack + READ = 0, ///< Operation reading what is currently on top of the stack + PUSH = 1, ///< Operation pushing a new item on top of the stack + POP = 2, ///< Operation popping the item currently on top of the stack + RESET = 3 ///< Operation popping all items currently on the stack }; namespace detail { @@ -119,9 +120,9 @@ struct StackSymbolToStackOp { { stack_op_type stack_op = symbol_to_stack_op_type(stack_symbol); // PUSH => +1, POP => -1, READ => 0 - int32_t level_delta = stack_op == stack_op_type::PUSH ? 1 - : stack_op == stack_op_type::POP ? -1 - : 0; + int32_t level_delta = (stack_op == stack_op_type::PUSH) ? 1 + : (stack_op == stack_op_type::POP) ? -1 + : 0; return StackOpT{static_cast(level_delta), stack_symbol}; } @@ -133,14 +134,20 @@ struct StackSymbolToStackOp { * @brief Binary reduction operator to compute the absolute stack level from relative stack levels * (i.e., +1 for a PUSH, -1 for a POP operation). */ +template struct AddStackLevelFromStackOp { template constexpr CUDF_HOST_DEVICE StackOp operator()( StackOp const& lhs, StackOp const& rhs) const { - StackLevelT new_level = lhs.stack_level + rhs.stack_level; + StackLevelT new_level = (symbol_to_stack_op_type(rhs.value) == stack_op_type::RESET) + ? 
0 + : (lhs.stack_level + rhs.stack_level); return StackOp{new_level, rhs.value}; } + + /// Function object returning a stack operation type for a given stack symbol + StackSymbolToStackOpTypeT symbol_to_stack_op_type; }; /** @@ -323,13 +330,14 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, // Getting temporary storage requirements for the prefix sum of the stack level after each // operation - CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan(nullptr, - stack_level_scan_bytes, - stack_symbols_in, - d_kv_operations.Current(), - detail::AddStackLevelFromStackOp{}, - num_symbols_in, - stream)); + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( + nullptr, + stack_level_scan_bytes, + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{symbol_to_stack_op}, + num_symbols_in, + stream)); // Getting temporary storage requirements for the stable radix sort (sorting by stack level of the // operations) @@ -393,13 +401,14 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, d_kv_operations = cub::DoubleBuffer{d_kv_ops_current.data(), d_kv_ops_alt.data()}; // Compute prefix sum of the stack level after each operation - CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan(temp_storage.data(), - total_temp_storage_bytes, - stack_symbols_in, - d_kv_operations.Current(), - detail::AddStackLevelFromStackOp{}, - num_symbols_in, - stream)); + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( + temp_storage.data(), + total_temp_storage_bytes, + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{symbol_to_stack_op}, + num_symbols_in, + stream)); // Stable radix sort, sorting by stack level of the operations d_kv_operations_unsigned = cub::DoubleBuffer{ diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 26f6891d963..c4176d5673f 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -22,20 +22,40 @@ #include +#include + #include #include +#include 
#include namespace cudf::io::fst::detail { +/** + * @brief Helper function object that delegates a lookup to a given lookup table without mapping any + * of the given arguments. + */ +struct IdentityOp { + template + __host__ __device__ __forceinline__ auto operator()(LookUpTableT const& lookup_table, + Args&&... args) const + { + return lookup_table.lookup(std::forward(args)...); + } +}; + /** * @brief Class template that can be plugged into the finite-state machine to look up the symbol * group index for a given symbol. Class template does not support multi-symbol lookups (i.e., no * look-ahead). The class uses shared memory for the lookups. * * @tparam SymbolT The symbol type being passed in to lookup the corresponding symbol group id + * @tparam PreMapOpT A function object that is invoked with `(lut, symbol)` and must return the + * symbol group index of `symbol`. `lut` is an instance of the lookup table and `symbol` is the + * symbol for which to get the symbol group index. If no particular mapping is needed, an instance + * of `IdentityOp` can be used. */ -template +template class SingleSymbolSmemLUT { private: // Type used for representing a symbol group id (i.e., what we return for a given symbol) @@ -50,32 +70,36 @@ class SingleSymbolSmemLUT { }; public: + using TempStorage = cub::Uninitialized<_TempStorage>; + struct KernelParameter { + using LookupTableT = SingleSymbolSmemLUT; + // sym_to_sgid[min(symbol,num_valid_entries)] -> symbol group index - SymbolT num_valid_entries; + uint32_t num_valid_entries; // sym_to_sgid[symbol] -> symbol group index SymbolGroupIdT sym_to_sgid[NUM_ENTRIES_PER_LUT]; - }; - using TempStorage = cub::Uninitialized<_TempStorage>; + // Function object that transforms a symbol to a symbol group id + PreMapOpT pre_map_op; + }; /** * @brief Initializes the given \p sgid_init with the symbol group lookups defined by \p * symbol_strings. 
* - * @param[out] sgid_init A hostdevice_vector that will be populated - * @param[in] symbol_strings Array of strings, where the i-th string holds all symbols + * @param symbol_strings Array of strings, where the i-th string holds all symbols * (characters!) that correspond to the i-th symbol group index - * @param[in] stream The stream that shall be used to cudaMemcpyAsync the lookup table + * @param stream The stream that shall be used to cudaMemcpyAsync the lookup table * @return */ template - static void InitDeviceSymbolGroupIdLut( - cudf::detail::hostdevice_vector& sgid_init, - SymbolGroupItT const& symbol_strings, - rmm::cuda_stream_view stream) + static KernelParameter InitDeviceSymbolGroupIdLut(SymbolGroupItT const& symbol_strings, + PreMapOpT pre_map_op) { + KernelParameter init_data{}; + // The symbol group index to be returned if none of the given symbols match SymbolGroupIdT no_match_id = symbol_strings.size(); @@ -83,9 +107,7 @@ class SingleSymbolSmemLUT { SymbolGroupIdT max_base_match_val = 0; // Initialize all entries: by default we return the no-match-id - std::fill(&sgid_init.host_ptr()->sym_to_sgid[0], - &sgid_init.host_ptr()->sym_to_sgid[NUM_ENTRIES_PER_LUT], - no_match_id); + std::fill(&init_data.sym_to_sgid[0], &init_data.sym_to_sgid[NUM_ENTRIES_PER_LUT], no_match_id); // Set up lookup table uint32_t sg_id = 0; @@ -94,22 +116,24 @@ class SingleSymbolSmemLUT { // Iterate over all symbols that belong to the current symbol group for (auto const& sg_symbol : sg_symbols) { max_base_match_val = std::max(max_base_match_val, static_cast(sg_symbol)); - sgid_init.host_ptr()->sym_to_sgid[static_cast(sg_symbol)] = sg_id; + init_data.sym_to_sgid[static_cast(sg_symbol)] = sg_id; } sg_id++; } // Initialize the out-of-bounds lookup: sym_to_sgid[max_base_match_val+1] -> no_match_id - sgid_init.host_ptr()->sym_to_sgid[max_base_match_val + 1] = no_match_id; + init_data.sym_to_sgid[max_base_match_val + 1] = no_match_id; // Alias memory / return memory requirements 
- sgid_init.host_ptr()->num_valid_entries = max_base_match_val + 1; + init_data.num_valid_entries = max_base_match_val + 1; + init_data.pre_map_op = pre_map_op; - sgid_init.host_to_device_async(stream); + return init_data; } _TempStorage& temp_storage; SymbolGroupIdT num_valid_entries; + PreMapOpT pre_map_op; __device__ __forceinline__ _TempStorage& PrivateStorage() { @@ -140,7 +164,14 @@ class SingleSymbolSmemLUT { #endif } - constexpr CUDF_HOST_DEVICE int32_t operator()(SymbolT const symbol) const + template + constexpr CUDF_HOST_DEVICE int32_t operator()(SymbolT_ const symbol) const + { + // Look up the symbol group for given symbol + return pre_map_op(*this, symbol); + } + + constexpr CUDF_HOST_DEVICE int32_t lookup(SymbolT const symbol) const { // Look up the symbol group for given symbol return temp_storage @@ -148,6 +179,95 @@ class SingleSymbolSmemLUT { } }; +/** + * @brief Creates a symbol group lookup table of type `SingleSymbolSmemLUT` that uses a two-staged + * lookup approach. @p pre_map_op is a function object invoked with `(lut, symbol)` that must return + * the symbol group id for the given `symbol`. `lut` is an instance of the lookup table + * and `symbol` is a symbol from the input tape. Usually, @p pre_map_op first maps a symbol from + * the input tape to an integral that is convertible to `symbol_t`. In a second stage, @p pre_map_op + * uses `lut`'s `lookup(mapped_symbol)` that maps that integral to the symbol group id. 
+ * + * @tparam symbol_t Must be an integral type + * @tparam NUM_SYMBOL_GROUPS The number of symbol groups, excluding the catchall symbol group (aka + * "other" symbol group) + * @tparam pre_map_op_t A unary function object type that returns the symbol group id + * @param symbol_strings An array of vectors, where all the symbols in the i-th vector are mapped to + * the i-th symbol group + * @param pre_map_op A unary function object type that returns the symbol group id for a symbol + * @return A symbol group lookup table + */ +template +auto make_symbol_group_lut( + std::array, NUM_SYMBOL_GROUPS> const& symbol_strings, + pre_map_op_t pre_map_op) +{ + using lookup_table_t = SingleSymbolSmemLUT; + return lookup_table_t::InitDeviceSymbolGroupIdLut(symbol_strings, pre_map_op); +} + +/** + * @brief Creates a symbol group lookup table of type `SingleSymbolSmemLUT` that uses a two-staged + * lookup approach. @p pre_map_op is a function object invoked with `(lut, symbol)` that must return + * the symbol group id for the given `symbol`. `lut` is an instance of the lookup table + * and `symbol` is a symbol from the input tape. Usually, @p pre_map_op first maps a symbol from + * the input tape to an integral that is convertible to `symbol_t`. In a second stage, @p pre_map_op + * uses `lut`'s `lookup(mapped_symbol)` that maps that integral to the symbol group id. 
+ * + * @tparam symbol_t The type returned by @p pre_map_op must be assignable to `char` + * @tparam NUM_SYMBOL_GROUPS The number of symbol groups, excluding the catchall symbol group (aka + * "other" symbol group) + * @tparam pre_map_op_t A unary function object type that returns the symbol group id for a symbol + * @param symbol_strings An array of strings, where all the characters in the i-th string are mapped + * to the i-th symbol group + * @param pre_map_op A unary function object type that returns the symbol group id for a symbol + * @return A symbol group lookup table + */ +template +auto make_symbol_group_lut(std::array const& symbol_strings, + pre_map_op_t pre_map_op) +{ + using symbol_t = char; + using lookup_table_t = SingleSymbolSmemLUT; + return lookup_table_t::InitDeviceSymbolGroupIdLut(symbol_strings, pre_map_op); +} + +/** + * @brief Creates a symbol group lookup table that maps a symbol to a symbol group id, requiring the + * symbol type from the input tape to be assignable to `symbol_t` and `symbol_t` to be of integral + * type. + * + * @tparam symbol_t The input tape's symbol type must be assignable to this type + * @tparam NUM_SYMBOL_GROUPS The number of symbol groups, excluding the catchall symbol group (aka + * "other" symbol group) + * @param symbol_strings An array of vectors, where all the symbols in the i-th vector are mapped to + * the i-th symbol group + * @return A symbol group lookup table + */ +template +auto make_symbol_group_lut( + std::array, NUM_SYMBOL_GROUPS> const& symbol_strings) +{ + return make_symbol_group_lut(symbol_strings, IdentityOp{}); +} + +/** + * @brief Creates a symbol group lookup table that maps a symbol to a symbol group id, requiring the + * symbol type from the input tape to be assignable to `symbol_t` and `symbol_t` to be of integral + * type. 
+ * + * @tparam symbol_t The input tape's symbol type must be assignable to this type + * @tparam NUM_SYMBOL_GROUPS The number of symbol groups, excluding the catchall symbol group (aka + * "other" symbol group) + * @param symbol_strings An array of strings, where all the characters in the i-th string are mapped + * to the i-th symbol group + * @return A symbol group lookup table + */ +template +auto make_symbol_group_lut(std::array const& symbol_strings) +{ + return make_symbol_group_lut(symbol_strings, IdentityOp{}); +} + /** * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a new target state. The * class uses shared memory for the lookups. @@ -166,18 +286,20 @@ class TransitionTable { }; public: - using TempStorage = cub::Uninitialized<_TempStorage>; + static constexpr int32_t NUM_STATES = MAX_NUM_STATES; + using TempStorage = cub::Uninitialized<_TempStorage>; struct KernelParameter { + using LookupTableT = TransitionTable; + ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; }; template - static void InitDeviceTransitionTable( - cudf::detail::hostdevice_vector& transition_table_init, - std::array, MAX_NUM_STATES> const& translation_table, - rmm::cuda_stream_view stream) + static KernelParameter InitDeviceTransitionTable( + std::array, MAX_NUM_STATES> const& translation_table) { + KernelParameter init_data{}; // translation_table[state][symbol] -> new state for (std::size_t state = 0; state < translation_table.size(); ++state) { for (std::size_t symbol = 0; symbol < translation_table[state].size(); ++symbol) { @@ -185,13 +307,12 @@ class TransitionTable { static_cast(translation_table[state][symbol]) <= std::numeric_limits::max(), "Target state index value exceeds value representable by the transition table's type"); - transition_table_init.host_ptr()->transitions[symbol * MAX_NUM_STATES + state] = + init_data.transitions[symbol * MAX_NUM_STATES + state] = static_cast(translation_table[state][symbol]); } } - // Copy transition table to 
device - transition_table_init.host_to_device_async(stream); + return init_data; } constexpr CUDF_HOST_DEVICE TransitionTable(KernelParameter const& kernel_param, @@ -235,24 +356,83 @@ class TransitionTable { } }; +/** + * @brief Creates a transition table of type `TransitionTable` that maps `(state_id, match_id)` + * pairs to the new target state for the given `(state_id, match_id)`-combination. + * + * @tparam StateIdT An integral type used to represent state indexes + * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support + * @param transition_table The transition table + * @return A transition table of type `TransitionTable` + */ +template +auto make_transition_table( + std::array, MAX_NUM_STATES> const& transition_table) +{ + using transition_table_t = TransitionTable; + return transition_table_t::InitDeviceTransitionTable(transition_table); +} + +/** + * @brief Compile-time reflection to check if `OpT` type has the `TempStorage` and + * `KernelParameter` type members. + */ +template +struct is_complex_op : std::false_type {}; + +template +struct is_complex_op> + : std::true_type {}; + +/** + * @brief The device view that is passed to the finite-state transducer algorithm. Each of the + * lookup tables can either be a simple function object that defines the `operator()` required for + * respective lookup table or a complex class. 
+ * + * @tparam SymbolGroupIdLookupT + * @tparam TransitionTableT + * @tparam TranslationTableT + * @tparam NUM_STATES + */ template class dfa_device_view { private: - using sgid_lut_init_t = typename SymbolGroupIdLookupT::KernelParameter; - using transition_table_init_t = typename TransitionTableT::KernelParameter; - using translation_table_init_t = typename TranslationTableT::KernelParameter; + // Complex symbol group lookup operators need to declare a `TempStorage` and `KernelParameter` + // type member that is passed during device-side initialization. + using sgid_lut_init_t = std::conditional_t::value, + typename SymbolGroupIdLookupT::KernelParameter, + SymbolGroupIdLookupT>; + + // Complex transition table lookup operators need to declare a `TempStorage` and + // `KernelParameter` type member that is passed during device-side initialization. + using transition_table_init_t = std::conditional_t::value, + typename TransitionTableT::KernelParameter, + TransitionTableT>; + + // Complex translation table lookup operators need to declare a `TempStorage` and + // `KernelParameter` type member that is passed during device-side initialization. 
+ using translation_table_init_t = std::conditional_t::value, + typename TranslationTableT::KernelParameter, + TranslationTableT>; public: // The maximum number of states supported by this DFA instance // This is a value queried by the DFA simulation algorithm static constexpr int32_t MAX_NUM_STATES = NUM_STATES; - using SymbolGroupStorageT = typename SymbolGroupIdLookupT::TempStorage; - using TransitionTableStorageT = typename TransitionTableT::TempStorage; - using TranslationTableStorageT = typename TranslationTableT::TempStorage; + using SymbolGroupStorageT = std::conditional_t::value, + typename SymbolGroupIdLookupT::TempStorage, + typename cub::NullType>; + using TransitionTableStorageT = std::conditional_t::value, + typename TransitionTableT::TempStorage, + typename cub::NullType>; + using TranslationTableStorageT = std::conditional_t::value, + typename TranslationTableT::TempStorage, + typename cub::NullType>; __device__ auto InitSymbolGroupLUT(SymbolGroupStorageT& temp_storage) { @@ -286,14 +466,16 @@ class dfa_device_view { /** * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a sequence of symbols - * that the finite-state transducer is supposed to output for each transition. The class uses shared - * memory for the lookups. + * that the finite-state transducer is supposed to output for each transition. The class uses + * shared memory for the lookups. * * @tparam OutSymbolT The symbol type being output - * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output symbols + * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output + * symbols * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols + * be used. 
*/ template ; struct KernelParameter { + using LookupTableT = TransducerLookupTable; + OutSymbolOffsetT d_out_offsets[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; OutSymbolT d_out_symbols[MAX_TABLE_SIZE]; }; @@ -321,12 +509,11 @@ class TransducerLookupTable { * @note Synchronizes the thread block, if called from device, and, hence, requires all threads * of the thread block to call the constructor */ - static void InitDeviceTranslationTable( - cudf::detail::hostdevice_vector& translation_table_init, + static KernelParameter InitDeviceTranslationTable( std::array, MAX_NUM_SYMBOLS>, MAX_NUM_STATES> const& - translation_table, - rmm::cuda_stream_view stream) + translation_table) { + KernelParameter init_data; std::vector out_symbols; out_symbols.reserve(MAX_TABLE_SIZE); std::vector out_symbol_offsets; @@ -357,15 +544,11 @@ class TransducerLookupTable { CUDF_EXPECTS(out_symbols.size() <= MAX_TABLE_SIZE, "Unsupported translation table"); // Prepare host-side data to be copied and passed to the device - std::copy(std::cbegin(out_symbol_offsets), - std::cend(out_symbol_offsets), - translation_table_init.host_ptr()->d_out_offsets); - std::copy(std::cbegin(out_symbols), - std::cend(out_symbols), - translation_table_init.host_ptr()->d_out_symbols); - - // Copy data to device - translation_table_init.host_to_device_async(stream); + std::copy( + std::cbegin(out_symbol_offsets), std::cend(out_symbol_offsets), init_data.d_out_offsets); + std::copy(std::cbegin(out_symbols), std::cend(out_symbols), init_data.d_out_symbols); + + return init_data; } private: @@ -408,24 +591,130 @@ class TransducerLookupTable { #endif } - template - constexpr CUDF_HOST_DEVICE OutSymbolT operator()(StateIndexT const state_id, - SymbolIndexT const match_id, - RelativeOffsetT const relative_offset) const + template + constexpr CUDF_HOST_DEVICE auto operator()(StateIndexT const state_id, + SymbolIndexT const match_id, + RelativeOffsetT const relative_offset, + SymbolT const /*read_symbol*/) const { auto offset 
= temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id] + relative_offset; return temp_storage.out_symbols[offset]; } - template + template constexpr CUDF_HOST_DEVICE OutSymbolOffsetT operator()(StateIndexT const state_id, - SymbolIndexT const match_id) const + SymbolIndexT const match_id, + SymbolT const /*read_symbol*/) const { return temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id + 1] - temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id]; } }; +/** + * @brief Creates a translation table that maps (old_state, symbol_group_id) transitions to a + * sequence of symbols that the finite-state transducer is supposed to output for each transition. + * + * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols + * be used + * @tparam OutSymbolT The symbol type being output + * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support + * @param translation_table The translation table + * @return A translation table of type `TransducerLookupTable`. + */ +template +auto make_translation_table(std::array, MAX_NUM_SYMBOLS>, + MAX_NUM_STATES> const& translation_table) +{ + using OutSymbolOffsetT = int32_t; + using translation_table_t = TransducerLookupTable; + return translation_table_t::InitDeviceTranslationTable(translation_table); +} + +template +class TranslationOp { + private: + struct _TempStorage {}; + + public: + using TempStorage = cub::Uninitialized<_TempStorage>; + + struct KernelParameter { + using LookupTableT = TranslationOp; + TranslationOpT translation_op; + }; + + /** + * @brief Initializes the lookup table, primarily to be invoked from within device code but also + * provides host-side implementation for verification. 
+ * @note Synchronizes the thread block, if called from device, and, hence, requires all threads + * of the thread block to call the constructor + */ + static KernelParameter InitDeviceTranslationTable(TranslationOpT translation_op) + { + return KernelParameter{translation_op}; + } + + private: + _TempStorage& temp_storage; + TranslationOpT translation_op; + + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + public: + CUDF_HOST_DEVICE TranslationOp(KernelParameter const& kernel_param, TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()), translation_op(kernel_param.translation_op) + { + } + + template + constexpr CUDF_HOST_DEVICE auto operator()(StateIndexT const state_id, + SymbolIndexT const match_id, + RelativeOffsetT const relative_offset, + SymbolT const read_symbol) const + { + return translation_op(*this, state_id, match_id, relative_offset, read_symbol); + } + + template + constexpr CUDF_HOST_DEVICE auto operator()(StateIndexT const state_id, + SymbolIndexT const match_id, + SymbolT const read_symbol) const + { + return translation_op(*this, state_id, match_id, read_symbol); + } +}; + +/** + * @brief Creates a simple translation table that uses a simple function object to retrieve the + * + * @tparam FunctorT A function object type that must implement two signatures: (1) with `(state_id, + * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)` + * @param map_op A function object that must implement two signatures: (1) with `(state_id, + * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)`. + * Invocations of the first signature, (1), must return the number of symbols that are emitted for + * the given transition. 
The second signature, (2), must return the i-th symbol to be emitted for + * that transition, where `i` corresponds to `relative_offset` + * @return A translation table of type `TranslationOp` + */ +template +auto make_translation_functor(FunctorT map_op) +{ + return TranslationOp::InitDeviceTranslationTable(map_op); +} + /** * @brief Helper class to facilitate the specification and instantiation of a DFA (i.e., the * transition table and its number of states, the mapping of symbols to symbol groups, and the @@ -437,70 +726,32 @@ class TransducerLookupTable { * @tparam NUM_STATES The number of states defined by the DFA (the other dimension of the * transition table) */ -template +template class Dfa { - public: - // The maximum number of states supported by this DFA instance - // This is a value queried by the DFA simulation algorithm - static constexpr int32_t MAX_NUM_STATES = NUM_STATES; - - private: - // Symbol-group id lookup table - using SymbolGroupIdLookupT = detail::SingleSymbolSmemLUT; - using SymbolGroupIdInitT = typename SymbolGroupIdLookupT::KernelParameter; - - // Transition table - using TransitionTableT = detail::TransitionTable; - using TransitionTableInitT = typename TransitionTableT::KernelParameter; - - // Translation lookup table - using OutSymbolOffsetT = uint32_t; - using TranslationTableT = detail::TransducerLookupTable; - using TranslationTableInitT = typename TranslationTableT::KernelParameter; + static constexpr int32_t single_item = 1; + public: auto get_device_view() { - return dfa_device_view{ - sgid_init.d_begin(), transition_table_init.d_begin(), translation_table_init.d_begin()}; + return dfa_device_view{ + &init_data.d_begin()->sgid_lut_init, + &init_data.d_begin()->transition_table_init, + &init_data.d_begin()->translation_table_init}; } - public: - /** - * @brief Constructs a new DFA. - * - * @param symbol_vec Sequence container of symbol groups. Each symbol group is a sequence - * container to symbols within that group.
The index of the symbol group containing a symbol being - * read will be used as symbol_gid of the transition and translation tables. - * @param tt_vec The transition table - * @param out_tt_vec The translation table - * @param stream The stream to which memory operations and kernels are getting dispatched to - */ - template - Dfa(SymbolGroupIdItT const& symbol_vec, - std::array, NUM_STATES> const& tt_vec, - std::array, NUM_SYMBOLS>, NUM_STATES> const& out_tt_vec, - cudaStream_t stream) + Dfa(SymbolGroupIdInitT const& sgid_lut_init, + TransitionTableInitT const& transition_table_init, + TranslationTableInitT const& translation_table_init, + rmm::cuda_stream_view stream) + : init_data{single_item, stream} { - constexpr std::size_t single_item = 1; - - sgid_init = cudf::detail::hostdevice_vector{single_item, stream}; - transition_table_init = - cudf::detail::hostdevice_vector{single_item, stream}; - translation_table_init = - cudf::detail::hostdevice_vector{single_item, stream}; - - // Initialize symbol group id lookup table - SymbolGroupIdLookupT::InitDeviceSymbolGroupIdLut(sgid_init, symbol_vec, stream); - - // Initialize state transition table - TransitionTableT::InitDeviceTransitionTable(transition_table_init, tt_vec, stream); - - // Initialize finite-state transducer lookup table - TranslationTableT::InitDeviceTranslationTable(translation_table_init, out_tt_vec, stream); + *init_data.host_ptr() = {sgid_lut_init, transition_table_init, translation_table_init}; + init_data.host_to_device_async(stream); } /** @@ -513,8 +764,8 @@ class Dfa { * indexes are written. 
* @tparam TransducedCountOutItT A single-item output iterator type to which the total number of * output symbols is written - * @tparam OffsetT A type large enough to index into either of both: (a) the input symbols and (b) - * the output symbols + * @tparam OffsetT A type large enough to index into either of both: (a) the input symbols and + * (b) the output symbols * @param d_chars Pointer to the input string of symbols * @param num_chars The total number of input symbols to process * @param d_out_it Random-access output iterator to which the transduced output is @@ -527,12 +778,12 @@ class Dfa { * "end-state" of the previous invocation of the algorithm. * @param stream CUDA stream to launch kernels within. Default is the null-stream. */ - template - void Transduce(SymbolT const* d_chars, + void Transduce(SymbolItT d_chars_it, OffsetT num_chars, TransducedOutItT d_out_it, TransducedIndexOutItT d_out_idx_it, @@ -545,7 +796,7 @@ class Dfa { DeviceTransduce(nullptr, temp_storage_bytes, this->get_device_view(), - d_chars, + d_chars_it, num_chars, d_out_it, d_out_idx_it, @@ -560,7 +811,7 @@ class Dfa { DeviceTransduce(temp_storage.data(), temp_storage_bytes, this->get_device_view(), - d_chars, + d_chars_it, num_chars, d_out_it, d_out_idx_it, @@ -570,9 +821,36 @@ class Dfa { } private: - cudf::detail::hostdevice_vector sgid_init{}; - cudf::detail::hostdevice_vector transition_table_init{}; - cudf::detail::hostdevice_vector translation_table_init{}; + struct host_device_data { + SymbolGroupIdInitT sgid_lut_init; + TransitionTableInitT transition_table_init; + TranslationTableInitT translation_table_init; + }; + cudf::detail::hostdevice_vector init_data{}; }; +/** + * @brief Creates a deterministic finite automaton (DFA) as specified by the triple of (symbol + * group, transition, translation)-lookup tables to be used with the finite-state transducer + * algorithm.
+ * + * @param sgid_lut_init Object used to initialize the symbol group lookup table + * @param transition_table_init Object used to initialize the transition table + * @param translation_table_init Object used to initialize the translation table + * @param stream The stream used to allocate and initialize device-side memory that is used to + * initialize the lookup tables + * @return A DFA of type `Dfa`. + */ +template +auto make_fst(SymbolGroupIdInitT const& sgid_lut_init, + TransitionTableInitT const& transition_table_init, + TranslationTableInitT const& translation_table_init, + rmm::cuda_stream_view stream) +{ + return Dfa( + sgid_lut_init, transition_table_init, translation_table_init, stream); +} + } // namespace cudf::io::fst::detail diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 1c7d5b11032..3bbfc4b5f83 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -44,6 +44,21 @@ struct tree_meta_t { */ enum class json_col_t : char { ListColumn, StructColumn, StringColumn, Unknown }; +/** + * @brief Enum class to specify whether we just push onto and pop from the stack or whether we also + * reset to an empty stack on a newline character. + */ +enum class stack_behavior_t : char { + /// Opening brackets and braces, [, {, push onto the stack, closing brackets and braces, ], }, pop + /// from the stack + PushPopWithoutReset, + + /// Opening brackets and braces, [, {, push onto the stack, closing brackets and braces, ], }, pop + /// from the stack. Newline characters are considered delimiters and therefore reset to an empty + /// stack. 
+ ResetOnDelimiter +}; + // Default name for a list's child column constexpr auto list_child_name{"element"}; @@ -175,12 +190,28 @@ namespace detail { * character of \p d_json_in, where a '{' represents that the corresponding input character is * within the context of a struct, a '[' represents that it is within the context of an array, and a * '_' symbol that it is at the root of the JSON. + * @param[in] stack_behavior Specifies the stack's behavior * @param[in] stream The cuda stream to dispatch GPU kernels to */ void get_stack_context(device_span json_in, SymbolT* d_top_of_stack, + stack_behavior_t stack_behavior, rmm::cuda_stream_view stream); +/** + * @brief Post-processes a token stream that may contain tokens from invalid lines. Expects that the + * token stream begins with a LineEnd token. + * + * @param tokens The tokens to be post-processed + * @param token_indices The tokens' corresponding indices that are post-processed + * @param stream The cuda stream to dispatch GPU kernels to + * @return Returns the post-processed token stream + */ +std::pair, rmm::device_uvector> process_token_stream( + device_span tokens, + device_span token_indices, + rmm::cuda_stream_view stream); + /** * @brief Parses the given JSON string and generates a tree representation of the given input. 
* diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 26dffd3328a..3b6c2b18250 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -39,8 +40,11 @@ #include #include +#include #include +#include #include +#include #include #include @@ -88,6 +92,115 @@ void check_input_size(std::size_t input_size) namespace cudf::io::json { +// FST to prune tokens of invalid lines for recovering JSON lines format +namespace token_filter { + +// Type used to represent the target state in the transition table +using StateT = char; + +// Type used to represent a symbol group id +using SymbolGroupT = uint8_t; + +/** + * @brief Definition of the DFA's states + */ +enum class dfa_states : StateT { VALID, INVALID, NUM_STATES }; + +// Aliases for readability of the transition table +constexpr auto TT_INV = dfa_states::INVALID; +constexpr auto TT_VLD = dfa_states::VALID; + +/** + * @brief Definition of the symbol groups + */ +enum class dfa_symbol_group_id : SymbolGroupT { + ERROR, ///< Error token symbol group + DELIMITER, ///< Record / line delimiter symbol group + OTHER_SYMBOLS, ///< Symbol group that implicitly matches all other tokens + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; + +constexpr auto TT_NUM_STATES = static_cast(dfa_states::NUM_STATES); +constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); + +// Lookup table to map an input symbol (i.e., a token) to a symbol group +std::array, NUM_SYMBOL_GROUPS - 1> const symbol_groups{{ + {static_cast(token_t::ErrorBegin)}, // Symbols mapping to ERROR + {static_cast(token_t::LineEnd)} // Symbols mapping to DELIMITER +}}; + +/** + * @brief Function object to map (token,token_index) tuples to a symbol group. 
+ */ +struct UnwrapTokenFromSymbolOp { + template + CUDF_HOST_DEVICE SymbolGroupT operator()(SymbolGroupLookupTableT const& sgid_lut, + thrust::tuple symbol) const + { + PdaTokenT const token_type = thrust::get<0>(symbol); + return sgid_lut.lookup(token_type); + } +}; + +/** + * @brief Translation function object that discards line delimiter tokens and tokens belonging to + * invalid lines. + */ +struct TransduceToken { + template + constexpr CUDF_HOST_DEVICE SymbolT operator()(TransducerTableT const&, + StateT const state_id, + SymbolGroupT const match_id, + RelativeOffsetT const relative_offset, + SymbolT const read_symbol) const + { + const bool is_end_of_invalid_line = + (state_id == static_cast(TT_INV) && + match_id == static_cast(dfa_symbol_group_id::DELIMITER)); + + if (is_end_of_invalid_line) { + return relative_offset == 0 ? SymbolT{token_t::StructEnd, 0} + : SymbolT{token_t::StructBegin, 0}; + } else { + return read_symbol; + } + } + + template + constexpr CUDF_HOST_DEVICE int32_t operator()(TransducerTableT const&, + StateT const state_id, + SymbolGroupT const match_id, + SymbolT const read_symbol) const + { + // Number of tokens emitted on invalid lines + constexpr int32_t num_inv_tokens = 2; + + const bool is_delimiter = match_id == static_cast(dfa_symbol_group_id::DELIMITER); + + // If state is either invalid or we're entering an invalid state, we discard tokens + const bool is_part_of_invalid_line = + (match_id != static_cast(dfa_symbol_group_id::ERROR) && + state_id == static_cast(TT_VLD)); + + // Indicates whether we transition from an invalid line to a potentially valid line + const bool is_end_of_invalid_line = (state_id == static_cast(TT_INV) && is_delimiter); + + int32_t const emit_count = + is_end_of_invalid_line ? num_inv_tokens : (is_part_of_invalid_line && !is_delimiter ? 
1 : 0); + return emit_count; + } +}; + +// Transition table +std::array, TT_NUM_STATES> const transition_table{ + {/* IN_STATE ERROR DELIM OTHER */ + /* VALID */ {{TT_INV, TT_VLD, TT_VLD}}, + /* INVALID */ {{TT_INV, TT_VLD, TT_INV}}}}; + +// The DFA's starting state +constexpr auto start_state = static_cast(TT_VLD); +} // namespace token_filter + // JSON to stack operator DFA (Deterministic Finite Automata) namespace to_stack_op { @@ -129,6 +242,7 @@ enum class dfa_symbol_group_id : uint8_t { CLOSING_BRACKET, ///< Closing bracket SG: ] QUOTE_CHAR, ///< Quote character SG: " ESCAPE_CHAR, ///< Escape character SG: '\' + NEWLINE_CHAR, ///< Newline character SG: '\n' OTHER_SYMBOLS, ///< SG implicitly matching all other characters NUM_SYMBOL_GROUPS ///< Total number of symbol groups }; @@ -138,21 +252,29 @@ constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NU // The i-th string representing all the characters of a symbol group std::array const symbol_groups{ - {{"{"}, {"["}, {"}"}, {"]"}, {"\""}, {"\\"}}}; + {{"{"}, {"["}, {"}"}, {"]"}, {"\""}, {"\\"}, {"\n"}}}; // Transition table std::array, TT_NUM_STATES> const transition_table{ - {/* IN_STATE { [ } ] " \ OTHER */ - /* TT_OOS */ {{TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS}}, - /* TT_STR */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_STR}}, - /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}}; + {/* IN_STATE { [ } ] " \ \n OTHER */ + /* TT_OOS */ {{TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS, TT_OOS}}, + /* TT_STR */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_STR, TT_STR}}, + /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}}; // Translation table (i.e., for each transition, what are the symbols that we output) std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const translation_table{ - {/* IN_STATE { [ } ] " \ OTHER */ - /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {}}}, - /* TT_STR */ {{{}, {}, {}, 
{}, {}, {}, {}}}, - /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {}}}}}; + {/* IN_STATE { [ } ] " \ \n OTHER */ + /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {}, {}}}, + /* TT_STR */ {{{}, {}, {}, {}, {}, {}, {}, {}}}, + /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {}, {}}}}}; + +// Translation table +std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const + resetting_translation_table{ + {/* IN_STATE { [ } ] " \ \n OTHER */ + /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {'\n'}, {}}}, + /* TT_STR */ {{{}, {}, {}, {}, {}, {}, {}, {}}}, + /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {}, {}}}}}; // The DFA's starting state constexpr auto start_state = static_cast(TT_OOS); @@ -409,6 +531,27 @@ enum class pda_state_t : StateT { PD_NUM_STATES }; +enum class json_format_cfg_t { + // Format describing regular JSON + JSON, + + // Format describing permissive newline-delimited JSON + // I.e., newline characters are only treated as delimiters at the root stack level + // E.g., this is treated as a single record: + // {"a": + // 123} + JSON_LINES, + + // Format describing strict newline-delimited JSON + // I.e., All newlines are delimiting a record, independent of the context they appear in + JSON_LINES_STRICT, + + // Transition table for parsing newline-delimited JSON that recovers from invalid JSON lines + // This format also follows `JSON_LINES_STRICT` behaviour + JSON_LINES_RECOVER + +}; + // Aliases for readability of the transition table constexpr auto PD_BOV = pda_state_t::PD_BOV; constexpr auto PD_BOA = pda_state_t::PD_BOA; @@ -430,68 +573,133 @@ constexpr auto start_state = static_cast(pda_state_t::PD_BOV); /** * @brief Getting the transition table */ -auto get_transition_table(bool newline_delimited_json) +auto get_transition_table(json_format_cfg_t format) { static_assert(static_cast(stack_symbol_group_id::STACK_ROOT) == 0); static_assert(static_cast(stack_symbol_group_id::STACK_LIST) == 1); static_assert(static_cast(stack_symbol_group_id::STACK_STRUCT) == 2); - // In
case of newline-delimited JSON, multiple newlines are ignored, similar to whitespace. - // Thas is, empty lines are ignored - auto const PD_ANL = newline_delimited_json ? PD_BOV : PD_PVL; std::array, PD_NUM_STATES> pda_tt; - // { [ } ] " \ , : space newline other - pda_tt[static_cast(pda_state_t::PD_BOV)] = { - PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, - PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, - PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON}; - pda_tt[static_cast(pda_state_t::PD_BOA)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOA, PD_LON, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOA, PD_ERR}; - pda_tt[static_cast(pda_state_t::PD_LON)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_PVL, PD_LON, - PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_PVL, PD_LON, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_PVL, PD_LON}; - pda_tt[static_cast(pda_state_t::PD_STR)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; - pda_tt[static_cast(pda_state_t::PD_SCE)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; - pda_tt[static_cast(pda_state_t::PD_PVL)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ANL, PD_ERR, - 
PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_PVL, PD_ERR, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_PVL, PD_ERR}; - pda_tt[static_cast(pda_state_t::PD_BFN)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_BFN, PD_ERR}; - pda_tt[static_cast(pda_state_t::PD_FLN)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; - pda_tt[static_cast(pda_state_t::PD_FNE)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; - pda_tt[static_cast(pda_state_t::PD_PFN)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_PFN, PD_PFN, PD_ERR}; - pda_tt[static_cast(pda_state_t::PD_ERR)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR}; + + if (format == json_format_cfg_t::JSON || format == json_format_cfg_t::JSON_LINES) { + // In case of newline-delimited JSON, multiple newlines are ignored, similar to whitespace. 
+ // Thas is, empty lines are ignored + // PD_ANL describes the target state after a new line on an empty stack (JSON root level) + auto const PD_ANL = (format == json_format_cfg_t::JSON) ? PD_PVL : PD_BOV; + + // First row: empty stack ("root" level of the JSON) + // Second row: '[' on top of stack (we're parsing a list value) + // Third row: '{' on top of stack (we're parsing a struct value) + // { [ } ] " \ , : space newline other + pda_tt[static_cast(pda_state_t::PD_BOV)] = { + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON}; + pda_tt[static_cast(pda_state_t::PD_BOA)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOA, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOA, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_LON)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_PVL, PD_LON}; + pda_tt[static_cast(pda_state_t::PD_STR)] = { + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + pda_tt[static_cast(pda_state_t::PD_SCE)] = { + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, 
PD_STR, PD_STR, PD_STR, PD_STR}; + pda_tt[static_cast(pda_state_t::PD_PVL)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ANL, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_PVL, PD_ERR, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_PVL, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_BFN)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_BFN, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_FLN)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + pda_tt[static_cast(pda_state_t::PD_FNE)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + pda_tt[static_cast(pda_state_t::PD_PFN)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_PFN, PD_PFN, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_ERR)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR}; + } + // Transition table for strict JSON lines (including 
recovery) + // Newlines are treated as record delimiters + else { + // In case of newline-delimited JSON, multiple newlines are ignored, similar to whitespace. + // Thas is, empty lines are ignored + // PD_ANL describes the target state after a new line after encountering error state + auto const PD_ANL = (format == json_format_cfg_t::JSON_LINES_RECOVER) ? PD_BOV : PD_ERR; + + // First row: empty stack ("root" level of the JSON) + // Second row: '[' on top of stack (we're parsing a list value) + // Third row: '{' on top of stack (we're parsing a struct value) + // { [ } ] " \ , : space newline other + pda_tt[static_cast(pda_state_t::PD_BOV)] = { + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON}; + pda_tt[static_cast(pda_state_t::PD_BOA)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOV, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOV, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_LON)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_BOV, PD_LON, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_BOV, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_BOV, PD_LON}; + pda_tt[static_cast(pda_state_t::PD_STR)] = { + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + pda_tt[static_cast(pda_state_t::PD_SCE)] = { + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, 
PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + pda_tt[static_cast(pda_state_t::PD_PVL)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_BOV, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_BFN)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_BOV, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_FLN)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_BOV, PD_FLN}; + pda_tt[static_cast(pda_state_t::PD_FNE)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + pda_tt[static_cast(pda_state_t::PD_PFN)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_PFN, PD_BOV, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_ERR)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ANL, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, 
PD_ANL, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ANL, PD_ERR}; + } return pda_tt; } /** * @brief Getting the translation table */ -auto get_translation_table() +auto get_translation_table(bool include_line_delimiter) { constexpr auto StructBegin = token_t::StructBegin; constexpr auto StructEnd = token_t::StructEnd; @@ -507,6 +715,15 @@ auto get_translation_table() constexpr auto ValueEnd = token_t::ValueEnd; constexpr auto ErrorBegin = token_t::ErrorBegin; + /** + * @brief Appends token_t::LineEnd token to the given token sequence, if and only if + * `include_line_delimiter` is true. + */ + auto nl_tokens = [include_line_delimiter](std::vector tokens) { + if (include_line_delimiter) { tokens.push_back(token_t::LineEnd); } + return tokens; + }; + std::array, NUM_PDA_SGIDS>, PD_NUM_STATES> pda_tlt; pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ {StructBegin}, // OPENING_BRACE @@ -518,7 +735,7 @@ auto get_translation_table() {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - {}, // LINE_BREAK + nl_tokens({}), // LINE_BREAK {ValueBegin}, // OTHER /*LIST*/ {StructBegin}, // OPENING_BRACE @@ -530,7 +747,7 @@ auto get_translation_table() {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - {}, // LINE_BREAK + nl_tokens({}), // LINE_BREAK {ValueBegin}, // OTHER /*STRUCT*/ {StructBegin}, // OPENING_BRACE @@ -542,7 +759,7 @@ auto get_translation_table() {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - {}, // LINE_BREAK + nl_tokens({}), // LINE_BREAK {ValueBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_BOA)] = { { /*ROOT*/ @@ -555,7 +772,7 @@ auto get_translation_table() {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK + nl_tokens({ErrorBegin}), // LINE_BREAK {ErrorBegin}, // OTHER /*LIST*/ {StructBegin}, // OPENING_BRACE @@ -567,7 +784,7 @@ auto get_translation_table() {ErrorBegin}, // COMMA {ErrorBegin}, // 
COLON {}, // WHITE_SPACE - {}, // LINE_BREAK + nl_tokens({}), // LINE_BREAK {ValueBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE @@ -579,7 +796,7 @@ auto get_translation_table() {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - {}, // LINE_BREAK + nl_tokens({}), // LINE_BREAK {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_LON)] = { { /*ROOT*/ @@ -592,132 +809,132 @@ auto get_translation_table() {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {ValueEnd}, // WHITE_SPACE - {ValueEnd}, // LINE_BREAK + nl_tokens({ValueEnd}), // LINE_BREAK {}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ValueEnd, ListEnd}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ValueEnd}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - {ValueEnd}, // LINE_BREAK - {}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ValueEnd, ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ValueEnd}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + nl_tokens({ValueEnd}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ValueEnd, StructMemberEnd, StructEnd}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ValueEnd, StructMemberEnd}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - {ValueEnd}, // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ValueEnd, StructMemberEnd, StructEnd}, // 
CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ValueEnd, StructMemberEnd}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + nl_tokens({ValueEnd}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + 
nl_tokens({}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_PVL)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ListEnd}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + 
{ErrorBegin}, // CLOSING_BRACE + {ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -728,145 +945,145 @@ auto get_translation_table() {StructMemberEnd}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - {}, // LINE_BREAK + nl_tokens({}), // LINE_BREAK {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_BFN)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {StructEnd}, // 
CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {StructMemberBegin, FieldNameBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {ErrorBegin}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {StructEnd}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {StructMemberBegin, FieldNameBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {ErrorBegin}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + 
{ErrorBegin}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {FieldNameEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {FieldNameEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // 
OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -877,7 +1094,7 @@ auto get_translation_table() {ErrorBegin}, // COMMA 
{}, // COLON {}, // WHITE_SPACE - {}, // LINE_BREAK + nl_tokens({}), // LINE_BREAK {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ @@ -890,32 +1107,32 @@ auto get_translation_table() {}, // COMMA {}, // COLON {}, // WHITE_SPACE - {}, // LINE_BREAK + nl_tokens({}), // LINE_BREAK {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}}}; // OTHER return pda_tlt; } @@ -929,9 +1146,32 @@ struct JSONToStackOp { template constexpr CUDF_HOST_DEVICE fst::stack_op_type operator()(StackSymbolT const& stack_symbol) const { - return (stack_symbol == '{' || stack_symbol == '[') ? fst::stack_op_type::PUSH - : (stack_symbol == '}' || stack_symbol == ']') ? 
fst::stack_op_type::POP - : fst::stack_op_type::READ; + switch (stack_symbol) { + case '{': + case '[': return fst::stack_op_type::PUSH; + case '}': + case ']': return fst::stack_op_type::POP; + default: return fst::stack_op_type::READ; + } + } +}; + +/** + * @brief Function object used to filter for brackets and braces that represent push and pop + * operations + */ +struct JSONWithRecoveryToStackOp { + template + constexpr CUDF_HOST_DEVICE fst::stack_op_type operator()(StackSymbolT const& stack_symbol) const + { + switch (stack_symbol) { + case '{': + case '[': return fst::stack_op_type::PUSH; + case '}': + case ']': return fst::stack_op_type::POP; + case '\n': return fst::stack_op_type::RESET; + default: return fst::stack_op_type::READ; + } } }; @@ -1030,6 +1270,7 @@ namespace detail { void get_stack_context(device_span json_in, SymbolT* d_top_of_stack, + stack_behavior_t stack_behavior, rmm::cuda_stream_view stream) { check_input_size(json_in.size()); @@ -1052,15 +1293,19 @@ void get_stack_context(device_span json_in, rmm::device_uvector stack_op_indices{json_in.size(), stream}; // Prepare finite-state transducer that only selects '{', '}', '[', ']' outside of quotes - using ToStackOpFstT = - cudf::io::fst::detail::Dfa( - to_stack_op::dfa_symbol_group_id::NUM_SYMBOL_GROUPS), - static_cast(to_stack_op::dfa_states::TT_NUM_STATES)>; - ToStackOpFstT json_to_stack_ops_fst{to_stack_op::symbol_groups, - to_stack_op::transition_table, - to_stack_op::translation_table, - stream}; + constexpr auto max_translation_table_size = + to_stack_op::NUM_SYMBOL_GROUPS * to_stack_op::TT_NUM_STATES; + + // Translation table specialized on the choice of whether to reset on newlines outside of strings + const auto translation_table = (stack_behavior == stack_behavior_t::ResetOnDelimiter) + ? 
to_stack_op::resetting_translation_table + : to_stack_op::translation_table; + + auto json_to_stack_ops_fst = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(to_stack_op::symbol_groups), + fst::detail::make_transition_table(to_stack_op::transition_table), + fst::detail::make_translation_table(translation_table), + stream); // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end // of structs/lists @@ -1075,16 +1320,80 @@ void get_stack_context(device_span json_in, // Copy back to actual number of stack operations auto const num_stack_ops = d_num_stack_ops.value(stream); - // stack operations with indices are converted to top of the stack for each character in the input - fst::sparse_stack_op_to_top_of_stack( - stack_ops.data(), - device_span{stack_op_indices.data(), num_stack_ops}, - JSONToStackOp{}, - d_top_of_stack, - root_symbol, - read_symbol, - json_in.size(), + // Stack operations with indices are converted to top of the stack for each character in the input + if (stack_behavior == stack_behavior_t::ResetOnDelimiter) { + fst::sparse_stack_op_to_top_of_stack( + stack_ops.data(), + device_span{stack_op_indices.data(), num_stack_ops}, + JSONWithRecoveryToStackOp{}, + d_top_of_stack, + root_symbol, + read_symbol, + json_in.size(), + stream); + } else { + fst::sparse_stack_op_to_top_of_stack( + stack_ops.data(), + device_span{stack_op_indices.data(), num_stack_ops}, + JSONToStackOp{}, + d_top_of_stack, + root_symbol, + read_symbol, + json_in.size(), + stream); + } +} + +std::pair, rmm::device_uvector> process_token_stream( + device_span tokens, + device_span token_indices, + rmm::cuda_stream_view stream) +{ + // Instantiate FST for post-processing the token stream to remove all tokens that belong to an + // invalid JSON line + token_filter::UnwrapTokenFromSymbolOp sgid_op{}; + auto filter_fst = + fst::detail::make_fst(fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), + 
fst::detail::make_transition_table(token_filter::transition_table), + fst::detail::make_translation_functor(token_filter::TransduceToken{}), + stream); + + auto const mr = rmm::mr::get_current_device_resource(); + rmm::device_scalar d_num_selected_tokens(stream, mr); + rmm::device_uvector filtered_tokens_out{tokens.size(), stream, mr}; + rmm::device_uvector filtered_token_indices_out{tokens.size(), stream, mr}; + + // The FST is run on the reverse token stream, discarding all tokens between ErrorBegin and the + // next LineEnd (LineEnd, inv_token_0, inv_token_1, ..., inv_token_n, ErrorBegin, LineEnd, ...), + // emitting a [StructBegin, StructEnd] pair on the end of such an invalid line. In that example, + // inv_token_i for i in [0, n] together with the ErrorBegin are removed and replaced with + // StructBegin, StructEnd. Also, all LineEnd are removed as well, as these are not relevant after + // this stage anymore + filter_fst.Transduce( + thrust::make_reverse_iterator(thrust::make_zip_iterator(tokens.data(), token_indices.data()) + + tokens.size()), + static_cast(tokens.size()), + thrust::make_reverse_iterator( + thrust::make_zip_iterator(filtered_tokens_out.data(), filtered_token_indices_out.data()) + + tokens.size()), + thrust::make_discard_iterator(), + d_num_selected_tokens.data(), + token_filter::start_state, stream); + + auto const num_total_tokens = d_num_selected_tokens.value(stream); + rmm::device_uvector tokens_out{num_total_tokens, stream, mr}; + rmm::device_uvector token_indices_out{num_total_tokens, stream, mr}; + thrust::copy(rmm::exec_policy(stream), + filtered_tokens_out.end() - num_total_tokens, + filtered_tokens_out.end(), + tokens_out.data()); + thrust::copy(rmm::exec_policy(stream), + filtered_token_indices_out.end() - num_total_tokens, + filtered_token_indices_out.end(), + token_indices_out.data()); + + return std::make_pair(std::move(tokens_out), std::move(token_indices_out)); } std::pair, rmm::device_uvector> get_token_stream( @@ -1100,13 
+1409,25 @@ std::pair, rmm::device_uvector> ge auto const new_line_delimited_json = options.is_enabled_lines(); + // (!new_line_delimited_json) => JSON + // (new_line_delimited_json and recover_from_error) => JSON_LINES_RECOVER + // (new_line_delimited_json and !recover_from_error) => JSON_LINES + auto format = new_line_delimited_json + ? (options.recovery_mode() == json_recovery_mode_t::RECOVER_WITH_NULL + ? tokenizer_pda::json_format_cfg_t::JSON_LINES_RECOVER + : tokenizer_pda::json_format_cfg_t::JSON_LINES) + : tokenizer_pda::json_format_cfg_t::JSON; + // Prepare for PDA transducer pass, merging input symbols with stack symbols - rmm::device_uvector pda_sgids = [json_in, stream]() { + auto const recover_from_error = (format == tokenizer_pda::json_format_cfg_t::JSON_LINES_RECOVER); + rmm::device_uvector pda_sgids = [json_in, stream, recover_from_error]() { // Memory holding the top-of-stack stack context for the input rmm::device_uvector stack_op_indices{json_in.size(), stream}; // Identify what is the stack context for each input character (JSON-root, struct, or list) - get_stack_context(json_in, stack_op_indices.data(), stream); + auto const stack_behavior = recover_from_error ? 
stack_behavior_t::ResetOnDelimiter + : stack_behavior_t::PushPopWithoutReset; + get_stack_context(json_in, stack_op_indices.data(), stack_behavior, stream); rmm::device_uvector pda_sgids{json_in.size(), stream}; auto zip_in = thrust::make_zip_iterator(json_in.data(), stack_op_indices.data()); @@ -1118,22 +1439,21 @@ std::pair, rmm::device_uvector> ge return pda_sgids; }(); - // PDA transducer alias - using ToTokenStreamFstT = - cudf::io::fst::detail::Dfa( - tokenizer_pda::pda_state_t::PD_NUM_STATES)>; - // Instantiating PDA transducer - std::vector> pda_sgid_identity{tokenizer_pda::NUM_PDA_SGIDS}; + std::array, tokenizer_pda::NUM_PDA_SGIDS> pda_sgid_identity{}; std::generate(std::begin(pda_sgid_identity), std::end(pda_sgid_identity), [i = char{0}]() mutable { return std::vector{i++}; }); - ToTokenStreamFstT json_to_tokens_fst{pda_sgid_identity, - tokenizer_pda::get_transition_table(new_line_delimited_json), - tokenizer_pda::get_translation_table(), - stream}; + + constexpr auto max_translation_table_size = + tokenizer_pda::NUM_PDA_SGIDS * + static_cast(tokenizer_pda::pda_state_t::PD_NUM_STATES); + auto json_to_tokens_fst = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(pda_sgid_identity), + fst::detail::make_transition_table(tokenizer_pda::get_transition_table(format)), + fst::detail::make_translation_table( + tokenizer_pda::get_translation_table(recover_from_error)), + stream); // Perform a PDA-transducer pass // Compute the maximum amount of tokens that can possibly be emitted for a given input size @@ -1145,21 +1465,34 @@ std::pair, rmm::device_uvector> ge auto const max_token_out_count = cudf::util::div_rounding_up_safe(json_in.size(), min_chars_per_struct) * max_tokens_per_struct; rmm::device_scalar num_written_tokens{stream}; - rmm::device_uvector tokens{max_token_out_count, stream, mr}; - rmm::device_uvector tokens_indices{max_token_out_count, stream, mr}; + // In case we're recovering on invalid JSON lines, post-processing the token stream 
requires to + // see a JSON-line delimiter as the very first item + SymbolOffsetT const delimiter_offset = + (format == tokenizer_pda::json_format_cfg_t::JSON_LINES_RECOVER ? 1 : 0); + rmm::device_uvector tokens{max_token_out_count + delimiter_offset, stream, mr}; + rmm::device_uvector tokens_indices{ + max_token_out_count + delimiter_offset, stream, mr}; json_to_tokens_fst.Transduce(pda_sgids.begin(), static_cast(json_in.size()), - tokens.data(), - tokens_indices.data(), + tokens.data() + delimiter_offset, + tokens_indices.data() + delimiter_offset, num_written_tokens.data(), tokenizer_pda::start_state, stream); - auto const num_total_tokens = num_written_tokens.value(stream); + auto const num_total_tokens = num_written_tokens.value(stream) + delimiter_offset; tokens.resize(num_total_tokens, stream); tokens_indices.resize(num_total_tokens, stream); + if (delimiter_offset == 1) { + tokens.set_element(0, token_t::LineEnd, stream); + auto [filtered_tokens, filtered_tokens_indices] = + process_token_stream(tokens, tokens_indices, stream); + tokens = std::move(filtered_tokens); + tokens_indices = std::move(filtered_tokens_indices); + } + CUDF_EXPECTS(num_total_tokens <= max_token_out_count, "Generated token count exceeds the expected token count"); @@ -1281,6 +1614,7 @@ void make_json_column(json_column& root_column, case token_t::ValueBegin: return "ValueBegin"; case token_t::ValueEnd: return "ValueEnd"; case token_t::ErrorBegin: return "ErrorBegin"; + case token_t::LineEnd: return "LineEnd"; default: return "Unknown"; } }; diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 1970b29fee9..fd69251e4f5 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -129,9 +129,6 @@ TEST_F(FstTest, GroundTruth) // Type sufficiently large to index symbols within the input and output (may be unsigned) using SymbolOffsetT = uint32_t; - // Helper class to set up transition table, symbol group lookup table, and translation table - 
using DfaFstT = cudf::io::fst::detail::Dfa; - // Prepare cuda stream for data transfers & kernels rmm::cuda_stream stream{}; rmm::cuda_stream_view stream_view(stream); @@ -167,7 +164,11 @@ TEST_F(FstTest, GroundTruth) cudf::detail::hostdevice_vector out_indexes_gpu(input.size(), stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); // Allocate device-side temporary storage & run algorithm parser.Transduce(d_input.data(), diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index d0c16078329..e4d52a2953e 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1769,4 +1769,50 @@ TEST_F(JsonReaderTest, TrailingCommas) } } +TEST_F(JsonReaderTest, JSONLinesRecovering) +{ + std::string data = + // 0 -> a: -2 (valid) + R"({"a":-2})" + "\n" + // 1 -> (invalid) + R"({"a":])" + "\n" + // 2 -> (invalid) + R"({"b":{"a":[321})" + "\n" + // 3 -> c: 1.2 (valid) + R"({"c":1.2})" + "\n" + "\n" + // 4 -> a: 123 (valid) + R"({"a":123})"; + + auto filepath = temp_env->get_temp_dir() + "RecoveringLines.json"; + { + std::ofstream outfile(filepath, std::ofstream::out); + outfile << data; + } + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 2); + EXPECT_EQ(result.tbl->num_rows(), 5); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); + + std::vector a_validity{true, false, false, false, true}; + std::vector
c_validity{false, false, false, true, false}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), + int64_wrapper{{-2, 0, 0, 0, 123}, a_validity.cbegin()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), + float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0}, c_validity.cbegin()}); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index a81348872cf..ad6678dbe5b 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -258,6 +258,7 @@ tree_meta_t2 get_tree_representation_cpu( case cuio_json::token_t::ValueEnd: return "VE"; case cuio_json::token_t::StructMemberBegin: return " <"; case cuio_json::token_t::StructMemberEnd: return " >"; + case cuio_json::token_t::LineEnd: return ";"; default: return "."; } }; diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index d82abdd1287..00d657108b8 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -32,6 +32,11 @@ #include #include +#include + +#include +#include + #include namespace cuio_json = cudf::io::json; @@ -163,7 +168,8 @@ TEST_F(JsonTest, StackContext) cudf::detail::hostdevice_vector stack_context(input.size(), stream); // Run algorithm - cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stream); + constexpr auto stack_behavior = cuio_json::stack_behavior_t::PushPopWithoutReset; + cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stack_behavior, stream); // Copy back the results stack_context.device_to_host_async(stream); @@ -211,7 +217,8 @@ TEST_F(JsonTest, StackContextUtf8) cudf::detail::hostdevice_vector stack_context(input.size(), stream); // Run algorithm - cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stream); + constexpr auto stack_behavior = cuio_json::stack_behavior_t::PushPopWithoutReset; + cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stack_behavior, stream); 
// Copy back the results stack_context.device_to_host_async(stream); @@ -229,6 +236,55 @@ TEST_F(JsonTest, StackContextUtf8) CUDF_TEST_EXPECT_VECTOR_EQUAL(golden_stack_context, stack_context, stack_context.size()); } +TEST_F(JsonTest, StackContextRecovering) +{ + // Type used to represent the atomic symbol type used within the finite-state machine + using SymbolT = char; + using StackSymbolT = char; + + // Prepare cuda stream for data transfers & kernels + auto const stream = cudf::get_default_stream(); + + // JSON lines input that recovers on invalid lines + std::string const input = R"({"a":-2}, + {"a": + {"a":{"a":[321 + {"a":[1]} + + {"b":123} + )"; + + // Expected stack context (including stack context of the newline characters) + std::string const golden_stack_context = + "_{{{{{{{__" + "___{{{{{" + "___{{{{{{{{{{[[[[" + "___{{{{{[[{_" + "_" + "___{{{{{{{{_" + "__"; + + // Prepare input & output buffers + cudf::string_scalar const d_scalar(input, true, stream); + auto const d_input = + cudf::device_span{d_scalar.data(), static_cast(d_scalar.size())}; + cudf::detail::hostdevice_vector stack_context(input.size(), stream); + + // Run algorithm + constexpr auto stack_behavior = cuio_json::stack_behavior_t::ResetOnDelimiter; + cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stack_behavior, stream); + + // Copy back the results + stack_context.device_to_host_async(stream); + + // Make sure we copied back the stack context + stream.synchronize(); + + // Verify results + ASSERT_EQ(golden_stack_context.size(), stack_context.size()); + CUDF_TEST_EXPECT_VECTOR_EQUAL(golden_stack_context, stack_context, stack_context.size()); +} + TEST_F(JsonTest, TokenStream) { using cuio_json::PdaTokenT; @@ -264,10 +320,8 @@ TEST_F(JsonTest, TokenStream) auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( d_input, default_options, stream, rmm::mr::get_current_device_resource()); // Copy back the number of tokens that were written - 
thrust::host_vector const tokens_gpu = - cudf::detail::make_host_vector_async(d_tokens_gpu, stream); - thrust::host_vector const token_indices_gpu = - cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); + auto const tokens_gpu = cudf::detail::make_std_vector_async(d_tokens_gpu, stream); + auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream); // Golden token stream sample using token_t = cuio_json::token_t; @@ -400,10 +454,8 @@ TEST_F(JsonTest, TokenStream2) auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( d_input, default_options, stream, rmm::mr::get_current_device_resource()); // Copy back the number of tokens that were written - thrust::host_vector const tokens_gpu = - cudf::detail::make_host_vector_async(d_tokens_gpu, stream); - thrust::host_vector const token_indices_gpu = - cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); + auto const tokens_gpu = cudf::detail::make_std_vector_async(d_tokens_gpu, stream); + auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream); // Golden token stream sample using token_t = cuio_json::token_t; @@ -487,6 +539,228 @@ TEST_P(JsonParserTest, ExtractColumn) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col2, parsed_col2); } +TEST_F(JsonTest, RecoveringTokenStream) +{ + // Test input. 
Inline comments used to indicate character indexes + // 012345678 <= line 0 + std::string const input = R"({"a":-2},)" + // 9 + "\n" + // 01234 <= line 1 + R"({"a":)" + // 5 + "\n" + // 67890123456789 <= line 2 + R"({"a":{"a":[321)" + // 0 + "\n" + // 123456789 <= line 3 + R"({"a":[1]})" + // 0 + "\n" + // 1 <= line 4 + "\n" + // 23456789 <= line 5 + R"({"b":123})"; + + // Golden token stream sample + using token_t = cuio_json::token_t; + std::vector> const golden_token_stream = { + // Line 0 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 1 (valid) + {10, token_t::StructBegin}, + {11, token_t::StructMemberBegin}, + {11, token_t::FieldNameBegin}, + {13, token_t::FieldNameEnd}, + // Line 2 (valid) + {16, token_t::StructBegin}, + {17, token_t::StructMemberBegin}, + {17, token_t::FieldNameBegin}, + {19, token_t::FieldNameEnd}, + {21, token_t::StructBegin}, + {22, token_t::StructMemberBegin}, + {22, token_t::FieldNameBegin}, + {24, token_t::FieldNameEnd}, + {26, token_t::ListBegin}, + {27, token_t::ValueBegin}, + {30, token_t::ValueEnd}, + // Line 3 (valid) + {31, token_t::StructBegin}, + {32, token_t::StructMemberBegin}, + {32, token_t::FieldNameBegin}, + {34, token_t::FieldNameEnd}, + {36, token_t::ListBegin}, + {37, token_t::ValueBegin}, + {38, token_t::ValueEnd}, + {38, token_t::ListEnd}, + {39, token_t::StructMemberEnd}, + {39, token_t::StructEnd}, + // Line 4 (empty) + // Line 5 (valid) + {42, token_t::StructBegin}, + {43, token_t::StructMemberBegin}, + {43, token_t::FieldNameBegin}, + {45, token_t::FieldNameEnd}, + {47, token_t::ValueBegin}, + {50, token_t::ValueEnd}, + {50, token_t::StructMemberEnd}, + {50, token_t::StructEnd}}; + + auto const stream = cudf::get_default_stream(); + + // Default parsing options + cudf::io::json_reader_options default_options{}; + default_options.set_recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); + default_options.enable_lines(true); + + // Prepare input & output buffers + 
cudf::string_scalar const d_scalar(input, true, stream); + auto const d_input = cudf::device_span{ + d_scalar.data(), static_cast(d_scalar.size())}; + + // Parse the JSON and get the token stream + auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( + d_input, default_options, stream, rmm::mr::get_current_device_resource()); + // Copy back the number of tokens that were written + auto const tokens_gpu = cudf::detail::make_std_vector_async(d_tokens_gpu, stream); + auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream); + + // Verify the number of tokens matches + ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size()); + ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size()); + + for (std::size_t i = 0; i < tokens_gpu.size(); i++) { + // Ensure the index the tokens are pointing to do match + EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i; + // Ensure the token category is correct + EXPECT_EQ(golden_token_stream[i].second, tokens_gpu[i]) << "Mismatch at #" << i; + } +} + +TEST_F(JsonTest, PostProcessTokenStream) +{ + // Golden token stream sample + using token_t = cuio_json::token_t; + using token_index_t = cuio_json::SymbolOffsetT; + using tuple_t = thrust::tuple; + + std::vector const input = {// Line 0 (invalid) + {0, token_t::LineEnd}, + {0, token_t::StructBegin}, + {1, token_t::StructMemberBegin}, + {1, token_t::FieldNameBegin}, + {3, token_t::FieldNameEnd}, + {5, token_t::ValueBegin}, + {7, token_t::ValueEnd}, + {7, token_t::StructMemberEnd}, + {7, token_t::StructEnd}, + {8, token_t::ErrorBegin}, + {9, token_t::LineEnd}, + // Line 1 + {10, token_t::StructBegin}, + {11, token_t::StructMemberBegin}, + {11, token_t::FieldNameBegin}, + {13, token_t::FieldNameEnd}, + {15, token_t::LineEnd}, + // Line 2 (invalid) + {16, token_t::StructBegin}, + {17, token_t::StructMemberBegin}, + {17, token_t::FieldNameBegin}, + {19, token_t::FieldNameEnd}, + {21, 
token_t::StructBegin}, + {22, token_t::StructMemberBegin}, + {22, token_t::FieldNameBegin}, + {24, token_t::FieldNameEnd}, + {26, token_t::ListBegin}, + {27, token_t::ValueBegin}, + {29, token_t::ErrorBegin}, + {30, token_t::LineEnd}, + // Line 3 (invalid) + {31, token_t::StructBegin}, + {32, token_t::StructMemberBegin}, + {32, token_t::FieldNameBegin}, + {34, token_t::FieldNameEnd}, + {36, token_t::ListBegin}, + {37, token_t::ValueBegin}, + {38, token_t::ValueEnd}, + {38, token_t::ListEnd}, + {39, token_t::StructMemberEnd}, + {39, token_t::StructEnd}, + {40, token_t::ErrorBegin}, + {40, token_t::LineEnd}, + // Line 4 + {41, token_t::LineEnd}, + // Line 5 + {42, token_t::StructBegin}, + {43, token_t::StructMemberBegin}, + {43, token_t::FieldNameBegin}, + {45, token_t::FieldNameEnd}, + {47, token_t::ValueBegin}, + {50, token_t::ValueEnd}, + {50, token_t::StructMemberEnd}, + {50, token_t::StructEnd}}; + + std::vector const expected_output = {// Line 0 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 1 + {10, token_t::StructBegin}, + {11, token_t::StructMemberBegin}, + {11, token_t::FieldNameBegin}, + {13, token_t::FieldNameEnd}, + // Line 2 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 3 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 4 (empty) + // Line 5 + {42, token_t::StructBegin}, + {43, token_t::StructMemberBegin}, + {43, token_t::FieldNameBegin}, + {45, token_t::FieldNameEnd}, + {47, token_t::ValueBegin}, + {50, token_t::ValueEnd}, + {50, token_t::StructMemberEnd}, + {50, token_t::StructEnd}}; + + // Decompose tuples + auto const stream = cudf::get_default_stream(); + std::vector offsets(input.size()); + std::vector tokens(input.size()); + auto token_tuples = thrust::make_zip_iterator(offsets.begin(), tokens.begin()); + thrust::copy(input.cbegin(), input.cend(), token_tuples); + + // Initialize device-side test data + auto const d_offsets = 
cudf::detail::make_device_uvector_async( + cudf::host_span{offsets.data(), offsets.size()}, + stream, + rmm::mr::get_current_device_resource()); + auto const d_tokens = + cudf::detail::make_device_uvector_async(tokens, stream, rmm::mr::get_current_device_resource()); + + // Run system-under-test + auto [d_filtered_tokens, d_filtered_indices] = + cuio_json::detail::process_token_stream(d_tokens, d_offsets, stream); + + auto const filtered_tokens = cudf::detail::make_std_vector_async(d_filtered_tokens, stream); + auto const filtered_indices = cudf::detail::make_std_vector_async(d_filtered_indices, stream); + + // Verify the number of tokens matches + ASSERT_EQ(filtered_tokens.size(), expected_output.size()); + ASSERT_EQ(filtered_indices.size(), expected_output.size()); + + for (std::size_t i = 0; i < filtered_tokens.size(); i++) { + // Ensure the index the tokens are pointing to do match + EXPECT_EQ(thrust::get<0>(expected_output[i]), filtered_indices[i]) << "Mismatch at #" << i; + // Ensure the token category is correct + EXPECT_EQ(thrust::get<1>(expected_output[i]), filtered_tokens[i]) << "Mismatch at #" << i; + } +} + TEST_P(JsonParserTest, UTF_JSON) { // Prepare cuda stream for data transfers & kernels