diff --git a/cpp/benchmarks/io/fst.cu b/cpp/benchmarks/io/fst.cu index 6b821eb5cae..c0c88517d41 100644 --- a/cpp/benchmarks/io/fst.cu +++ b/cpp/benchmarks/io/fst.cu @@ -67,10 +67,9 @@ auto make_test_json_data(nvbench::state& state) // Type used to represent the atomic symbol type used within the finite-state machine using SymbolT = char; // Type sufficiently large to index symbols within the input and output (may be unsigned) -using SymbolOffsetT = uint32_t; -// Helper class to set up transition table, symbol group lookup table, and translation table -using DfaFstT = cudf::io::fst::detail::Dfa; -constexpr std::size_t single_item = 1; +using SymbolOffsetT = uint32_t; +constexpr std::size_t single_item = 1; +constexpr auto max_translation_table_size = TT_NUM_STATES * NUM_SYMBOL_GROUPS; } // namespace @@ -94,7 +93,11 @@ void BM_FST_JSON(nvbench::state& state) cudf::detail::hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { @@ -129,7 +132,11 @@ void BM_FST_JSON_no_outidx(nvbench::state& state) cudf::detail::hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, 
[&](nvbench::launch& launch) { @@ -162,7 +169,11 @@ void BM_FST_JSON_no_out(nvbench::state& state) cudf::detail::hostdevice_vector output_gpu_size(single_item, stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { @@ -196,7 +207,11 @@ void BM_FST_JSON_no_str(nvbench::state& state) cudf::detail::hostdevice_vector out_indexes_gpu(d_input.size(), stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { diff --git a/cpp/include/cudf/io/detail/tokenize_json.hpp b/cpp/include/cudf/io/detail/tokenize_json.hpp index 4914f434c98..b2ea29a85c3 100644 --- a/cpp/include/cudf/io/detail/tokenize_json.hpp +++ b/cpp/include/cudf/io/detail/tokenize_json.hpp @@ -110,6 +110,8 @@ enum token_t : PdaTokenT { ValueEnd, /// Beginning-of-error token (on first encounter of a parsing error) ErrorBegin, + /// Delimiting a JSON line for error recovery + LineEnd, /// Total number of tokens NUM_TOKENS }; diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index 670409a898a..15dc2a614ad 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -54,6 +54,14 @@ struct schema_element { std::map child_types; }; +/** + * @brief 
Control the error recovery behavior of the json parser + */ +enum class json_recovery_mode_t { + FAIL, ///< Does not recover from an error when encountering an invalid format + RECOVER_WITH_NULL ///< Recovers from an error, replacing invalid records with null +}; + /** * @brief Input arguments to the `read_json` interface. * @@ -105,6 +113,9 @@ class json_reader_options { // Whether to keep the quote characters of string values bool _keep_quotes = false; + // Whether to recover after an invalid JSON line + json_recovery_mode_t _recovery_mode = json_recovery_mode_t::FAIL; + /** * @brief Constructor from source info. * @@ -235,6 +246,13 @@ class json_reader_options { */ bool is_enabled_keep_quotes() const { return _keep_quotes; } + /** + * @brief Queries the JSON reader's behavior on invalid JSON lines. + * + * @returns An enum that specifies the JSON reader's behavior on invalid JSON lines. + */ + json_recovery_mode_t recovery_mode() const { return _recovery_mode; } + /** * @brief Set data types for columns to be read. * @@ -305,6 +323,13 @@ class json_reader_options { * of string values */ void enable_keep_quotes(bool val) { _keep_quotes = val; } + + /** + * @brief Specifies the JSON reader's behavior on invalid JSON lines. + * + * @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines. + */ + void set_recovery_mode(json_recovery_mode_t val) { _recovery_mode = val; } }; /** @@ -449,6 +474,18 @@ class json_reader_options_builder { return *this; } + /** + * @brief Specifies the JSON reader's behavior on invalid JSON lines. + * + * @param val An enum value to indicate the JSON reader's behavior on invalid JSON lines. + * @return this for chaining + */ + json_reader_options_builder& recovery_mode(json_recovery_mode_t val) + { + options._recovery_mode = val; + return *this; + } + /** * @brief move json_reader_options member once it's built. 
*/ diff --git a/cpp/src/io/fst/agent_dfa.cuh b/cpp/src/io/fst/agent_dfa.cuh index 0c813c7917f..52fd039c097 100644 --- a/cpp/src/io/fst/agent_dfa.cuh +++ b/cpp/src/io/fst/agent_dfa.cuh @@ -83,16 +83,18 @@ class DFASimulationCallbackWrapper { if (!write) out_count = 0; } - template + template __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const character_index, StateIndexT const old_state, StateIndexT const new_state, - SymbolIndexT const symbol_id) + SymbolIndexT const symbol_id, + SymbolT const read_symbol) { - uint32_t const count = transducer_table(old_state, symbol_id); + uint32_t const count = transducer_table(old_state, symbol_id, read_symbol); if (write) { for (uint32_t out_char = 0; out_char < count; out_char++) { - out_it[out_count + out_char] = transducer_table(old_state, symbol_id, out_char); + out_it[out_count + out_char] = + transducer_table(old_state, symbol_id, out_char, read_symbol); out_idx_it[out_count + out_char] = offset + character_index; } } @@ -127,9 +129,10 @@ class StateVectorTransitionOp { { } - template + template __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, - SymbolIndexT const read_symbol_id) const + SymbolIndexT const& read_symbol_id, + SymbolT const& read_symbol) const { for (int32_t i = 0; i < NUM_INSTANCES; ++i) { state_vector[i] = transition_table(state_vector[i], read_symbol_id); @@ -154,15 +157,16 @@ struct StateTransitionOp { { } - template + template __host__ __device__ __forceinline__ void ReadSymbol(CharIndexT const& character_index, - SymbolIndexT const& read_symbol_id) + SymbolIndexT const& read_symbol_id, + SymbolT const& read_symbol) { // Remember what state we were in before we made the transition StateIndexT previous_state = state; state = transition_table(state, read_symbol_id); - callback_op.ReadSymbol(character_index, previous_state, state, read_symbol_id); + callback_op.ReadSymbol(character_index, previous_state, state, read_symbol_id, read_symbol); } }; @@ 
-230,7 +234,7 @@ struct AgentDFA { for (int32_t i = 0; i < NUM_SYMBOLS; ++i) { if (IS_FULL_BLOCK || threadIdx.x * SYMBOLS_PER_THREAD + i < max_num_chars) { auto matched_id = symbol_matcher(chars[i]); - callback_op.ReadSymbol(i, matched_id); + callback_op.ReadSymbol(i, matched_id, chars[i]); } } } @@ -253,7 +257,8 @@ struct AgentDFA { //--------------------------------------------------------------------- // LOADING FULL BLOCK OF CHARACTERS, NON-ALIASED //--------------------------------------------------------------------- - __device__ __forceinline__ void LoadBlock(CharT const* d_chars, + template + __device__ __forceinline__ void LoadBlock(CharInItT d_chars, OffsetT const block_offset, OffsetT const num_total_symbols, cub::Int2Type /*IS_FULL_BLOCK*/, @@ -261,7 +266,7 @@ struct AgentDFA { { CharT thread_chars[SYMBOLS_PER_THREAD]; - CharT const* d_block_symbols = d_chars + block_offset; + CharInItT d_block_symbols = d_chars + block_offset; cub::LoadDirectStriped(threadIdx.x, d_block_symbols, thread_chars); #pragma unroll @@ -273,7 +278,8 @@ struct AgentDFA { //--------------------------------------------------------------------- // LOADING PARTIAL BLOCK OF CHARACTERS, NON-ALIASED //--------------------------------------------------------------------- - __device__ __forceinline__ void LoadBlock(CharT const* d_chars, + template + __device__ __forceinline__ void LoadBlock(CharInItT d_chars, OffsetT const block_offset, OffsetT const num_total_symbols, cub::Int2Type /*IS_FULL_BLOCK*/, @@ -286,7 +292,7 @@ struct AgentDFA { // Last unit to be loaded is IDIV_CEIL(#SYM, SYMBOLS_PER_UNIT) OffsetT num_total_chars = num_total_symbols - block_offset; - CharT const* d_block_symbols = d_chars + block_offset; + CharInItT d_block_symbols = d_chars + block_offset; cub::LoadDirectStriped( threadIdx.x, d_block_symbols, thread_chars, num_total_chars); @@ -372,11 +378,26 @@ struct AgentDFA { } } + template + __device__ __forceinline__ void LoadBlock(CharInItT d_chars, + OffsetT const 
block_offset, + OffsetT const num_total_symbols) + { + // Check if we are loading a full tile of data + if (block_offset + SYMBOLS_PER_UINT_BLOCK < num_total_symbols) { + LoadBlock( + d_chars, block_offset, num_total_symbols, cub::Int2Type(), cub::Int2Type<1>()); + } else { + LoadBlock( + d_chars, block_offset, num_total_symbols, cub::Int2Type(), cub::Int2Type<1>()); + } + } + template __device__ __forceinline__ void GetThreadStateTransitionVector( SymbolMatcherT const& symbol_matcher, TransitionTableT const& transition_table, - CharT const* d_chars, + SymbolItT d_chars, OffsetT const block_offset, OffsetT const num_total_symbols, std::array& state_vector) @@ -416,7 +437,7 @@ struct AgentDFA { __device__ __forceinline__ void GetThreadStateTransitions( SymbolMatcherT const& symbol_matcher, TransitionTableT const& transition_table, - CharT const* d_chars, + SymbolItT d_chars, OffsetT const block_offset, OffsetT const num_total_symbols, StateIndexT& state, diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index 27ce6403ee8..a5d32cba125 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -42,9 +42,10 @@ namespace cudf::io::fst { * @brief Describes the kind of stack operation. 
*/ enum class stack_op_type : int8_t { - READ = 0, ///< Operation reading what is currently on top of the stack - PUSH = 1, ///< Operation pushing a new item on top of the stack - POP = 2 ///< Operation popping the item currently on top of the stack + READ = 0, ///< Operation reading what is currently on top of the stack + PUSH = 1, ///< Operation pushing a new item on top of the stack + POP = 2, ///< Operation popping the item currently on top of the stack + RESET = 3 ///< Operation popping all items currently on the stack }; namespace detail { @@ -119,9 +120,9 @@ struct StackSymbolToStackOp { { stack_op_type stack_op = symbol_to_stack_op_type(stack_symbol); // PUSH => +1, POP => -1, READ => 0 - int32_t level_delta = stack_op == stack_op_type::PUSH ? 1 - : stack_op == stack_op_type::POP ? -1 - : 0; + int32_t level_delta = (stack_op == stack_op_type::PUSH) ? 1 + : (stack_op == stack_op_type::POP) ? -1 + : 0; return StackOpT{static_cast(level_delta), stack_symbol}; } @@ -133,14 +134,20 @@ struct StackSymbolToStackOp { * @brief Binary reduction operator to compute the absolute stack level from relative stack levels * (i.e., +1 for a PUSH, -1 for a POP operation). */ +template struct AddStackLevelFromStackOp { template constexpr CUDF_HOST_DEVICE StackOp operator()( StackOp const& lhs, StackOp const& rhs) const { - StackLevelT new_level = lhs.stack_level + rhs.stack_level; + StackLevelT new_level = (symbol_to_stack_op_type(rhs.value) == stack_op_type::RESET) + ? 
0 + : (lhs.stack_level + rhs.stack_level); return StackOp{new_level, rhs.value}; } + + /// Function object returning a stack operation type for a given stack symbol + StackSymbolToStackOpTypeT symbol_to_stack_op_type; }; /** @@ -323,13 +330,14 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, // Getting temporary storage requirements for the prefix sum of the stack level after each // operation - CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan(nullptr, - stack_level_scan_bytes, - stack_symbols_in, - d_kv_operations.Current(), - detail::AddStackLevelFromStackOp{}, - num_symbols_in, - stream)); + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( + nullptr, + stack_level_scan_bytes, + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{symbol_to_stack_op}, + num_symbols_in, + stream)); // Getting temporary storage requirements for the stable radix sort (sorting by stack level of the // operations) @@ -393,13 +401,14 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, d_kv_operations = cub::DoubleBuffer{d_kv_ops_current.data(), d_kv_ops_alt.data()}; // Compute prefix sum of the stack level after each operation - CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan(temp_storage.data(), - total_temp_storage_bytes, - stack_symbols_in, - d_kv_operations.Current(), - detail::AddStackLevelFromStackOp{}, - num_symbols_in, - stream)); + CUDF_CUDA_TRY(cub::DeviceScan::InclusiveScan( + temp_storage.data(), + total_temp_storage_bytes, + stack_symbols_in, + d_kv_operations.Current(), + detail::AddStackLevelFromStackOp{symbol_to_stack_op}, + num_symbols_in, + stream)); // Stable radix sort, sorting by stack level of the operations d_kv_operations_unsigned = cub::DoubleBuffer{ diff --git a/cpp/src/io/fst/lookup_tables.cuh b/cpp/src/io/fst/lookup_tables.cuh index 26f6891d963..c4176d5673f 100644 --- a/cpp/src/io/fst/lookup_tables.cuh +++ b/cpp/src/io/fst/lookup_tables.cuh @@ -22,20 +22,40 @@ #include +#include + #include #include +#include 
#include namespace cudf::io::fst::detail { +/** + * @brief Helper function object that delegates a lookup to a given lookup table without mapping any + * of the given arguments. + */ +struct IdentityOp { + template + __host__ __device__ __forceinline__ auto operator()(LookUpTableT const& lookup_table, + Args&&... args) const + { + return lookup_table.lookup(std::forward(args)...); + } +}; + /** * @brief Class template that can be plugged into the finite-state machine to look up the symbol * group index for a given symbol. Class template does not support multi-symbol lookups (i.e., no * look-ahead). The class uses shared memory for the lookups. * * @tparam SymbolT The symbol type being passed in to lookup the corresponding symbol group id + * @tparam PreMapOpT A function object that is invoked with `(lut, symbol)` and must return the + * symbol group index of `symbol`. `lut` is an instance of the lookup table and `symbol` is the + * symbol for which to get the symbol group index. If no particular mapping is needed, an instance + * of `IdentityOp` can be used. */ -template +template class SingleSymbolSmemLUT { private: // Type used for representing a symbol group id (i.e., what we return for a given symbol) @@ -50,32 +70,36 @@ class SingleSymbolSmemLUT { }; public: + using TempStorage = cub::Uninitialized<_TempStorage>; + struct KernelParameter { + using LookupTableT = SingleSymbolSmemLUT; + // sym_to_sgid[min(symbol,num_valid_entries)] -> symbol group index - SymbolT num_valid_entries; + uint32_t num_valid_entries; // sym_to_sgid[symbol] -> symbol group index SymbolGroupIdT sym_to_sgid[NUM_ENTRIES_PER_LUT]; - }; - using TempStorage = cub::Uninitialized<_TempStorage>; + // Function object that transforms a symbol to a symbol group id + PreMapOpT pre_map_op; + }; /** * @brief Initializes the given \p sgid_init with the symbol group lookups defined by \p * symbol_strings. 
* - * @param[out] sgid_init A hostdevice_vector that will be populated - * @param[in] symbol_strings Array of strings, where the i-th string holds all symbols + * @param symbol_strings Array of strings, where the i-th string holds all symbols * (characters!) that correspond to the i-th symbol group index - * @param[in] stream The stream that shall be used to cudaMemcpyAsync the lookup table + * @param stream The stream that shall be used to cudaMemcpyAsync the lookup table * @return */ template - static void InitDeviceSymbolGroupIdLut( - cudf::detail::hostdevice_vector& sgid_init, - SymbolGroupItT const& symbol_strings, - rmm::cuda_stream_view stream) + static KernelParameter InitDeviceSymbolGroupIdLut(SymbolGroupItT const& symbol_strings, + PreMapOpT pre_map_op) { + KernelParameter init_data{}; + // The symbol group index to be returned if none of the given symbols match SymbolGroupIdT no_match_id = symbol_strings.size(); @@ -83,9 +107,7 @@ class SingleSymbolSmemLUT { SymbolGroupIdT max_base_match_val = 0; // Initialize all entries: by default we return the no-match-id - std::fill(&sgid_init.host_ptr()->sym_to_sgid[0], - &sgid_init.host_ptr()->sym_to_sgid[NUM_ENTRIES_PER_LUT], - no_match_id); + std::fill(&init_data.sym_to_sgid[0], &init_data.sym_to_sgid[NUM_ENTRIES_PER_LUT], no_match_id); // Set up lookup table uint32_t sg_id = 0; @@ -94,22 +116,24 @@ class SingleSymbolSmemLUT { // Iterate over all symbols that belong to the current symbol group for (auto const& sg_symbol : sg_symbols) { max_base_match_val = std::max(max_base_match_val, static_cast(sg_symbol)); - sgid_init.host_ptr()->sym_to_sgid[static_cast(sg_symbol)] = sg_id; + init_data.sym_to_sgid[static_cast(sg_symbol)] = sg_id; } sg_id++; } // Initialize the out-of-bounds lookup: sym_to_sgid[max_base_match_val+1] -> no_match_id - sgid_init.host_ptr()->sym_to_sgid[max_base_match_val + 1] = no_match_id; + init_data.sym_to_sgid[max_base_match_val + 1] = no_match_id; // Alias memory / return memory requirements 
- sgid_init.host_ptr()->num_valid_entries = max_base_match_val + 1; + init_data.num_valid_entries = max_base_match_val + 1; + init_data.pre_map_op = pre_map_op; - sgid_init.host_to_device_async(stream); + return init_data; } _TempStorage& temp_storage; SymbolGroupIdT num_valid_entries; + PreMapOpT pre_map_op; __device__ __forceinline__ _TempStorage& PrivateStorage() { @@ -140,7 +164,14 @@ class SingleSymbolSmemLUT { #endif } - constexpr CUDF_HOST_DEVICE int32_t operator()(SymbolT const symbol) const + template + constexpr CUDF_HOST_DEVICE int32_t operator()(SymbolT_ const symbol) const + { + // Look up the symbol group for given symbol + return pre_map_op(*this, symbol); + } + + constexpr CUDF_HOST_DEVICE int32_t lookup(SymbolT const symbol) const { // Look up the symbol group for given symbol return temp_storage @@ -148,6 +179,95 @@ class SingleSymbolSmemLUT { } }; +/** + * @brief Creates a symbol group lookup table of type `SingleSymbolSmemLUT` that uses a two-staged + * lookup approach. @p pre_map_op is a function object invoked with `(lut, symbol)` that must return + * the symbol group id for the given `symbol`. `lut` is an instance of the lookup table + * and `symbol` is a symbol from the input tape. Usually, @p pre_map_op first maps a symbol from + * the input tape to an integral that is convertible to `symbol_t`. In a second stage, @p pre_map_op + * uses `lut`'s `lookup(mapped_symbol)` that maps that integral to the symbol group id. 
+ * + * @tparam symbol_t Must be an integral type + * @tparam NUM_SYMBOL_GROUPS The number of symbol groups, excluding the catchall symbol group (aka + * "other" symbol group) + * @tparam pre_map_op_t A unary function object type that returns the symbol group id + * @param symbol_strings An array of vectors, where all the symbols in the i-th vector are mapped to + * the i-th symbol group + * @param pre_map_op A unary function object type that returns the symbol group id for a symbol + * @return A symbol group lookup table + */ +template +auto make_symbol_group_lut( + std::array, NUM_SYMBOL_GROUPS> const& symbol_strings, + pre_map_op_t pre_map_op) +{ + using lookup_table_t = SingleSymbolSmemLUT; + return lookup_table_t::InitDeviceSymbolGroupIdLut(symbol_strings, pre_map_op); +} + +/** + * @brief Creates a symbol group lookup table of type `SingleSymbolSmemLUT` that uses a two-staged + * lookup approach. @p pre_map_op is a function object invoked with `(lut, symbol)` that must return + * the symbol group id for the given `symbol`. `lut` is an instance of the lookup table + * and `symbol` is a symbol from the input tape. Usually, @p pre_map_op first maps a symbol from + * the input tape to an integral that is convertible to `symbol_t`. In a second stage, @p pre_map_op + * uses `lut`'s `lookup(mapped_symbol)` that maps that integral to the symbol group id. 
+ * + * @tparam symbol_t The type returned by @p pre_map_op must be assignable to `char` + * @tparam NUM_SYMBOL_GROUPS The number of symbol groups, excluding the catchall symbol group (aka + * "other" symbol group) + * @tparam pre_map_op_t A unary function object type that returns the symbol group id for a symbol + * @param symbol_strings An array of strings, where all the characters in the i-th string are mapped + * to the i-th symbol group + * @param pre_map_op A unary function object type that returns the symbol group id for a symbol + * @return A symbol group lookup table + */ +template +auto make_symbol_group_lut(std::array const& symbol_strings, + pre_map_op_t pre_map_op) +{ + using symbol_t = char; + using lookup_table_t = SingleSymbolSmemLUT; + return lookup_table_t::InitDeviceSymbolGroupIdLut(symbol_strings, pre_map_op); +} + +/** + * @brief Creates a symbol group lookup table that maps a symbol to a symbol group id, requiring the + * symbol type from the input tape to be assignable to `symbol_t` and `symbol_t` to be of integral + * type. + * + * @tparam symbol_t The input tape's symbol type must be assignable to this type + * @tparam NUM_SYMBOL_GROUPS The number of symbol groups, excluding the catchall symbol group (aka + * "other" symbol group) + * @param symbol_strings An array of vectors, where all the symbols in the i-th vector are mapped to + * the i-th symbol group + * @return A symbol group lookup table + */ +template +auto make_symbol_group_lut( + std::array, NUM_SYMBOL_GROUPS> const& symbol_strings) +{ + return make_symbol_group_lut(symbol_strings, IdentityOp{}); +} + +/** + * @brief Creates a symbol group lookup table that maps a symbol to a symbol group id, requiring the + * symbol type from the input tape to be assignable to `symbol_t` and `symbol_t` to be of integral + * type. 
+ * + * @tparam symbol_t The input tape's symbol type must be assignable to this type + * @tparam NUM_SYMBOL_GROUPS The number of symbol groups, excluding the catchall symbol group (aka + * "other" symbol group) + * @param symbol_strings An array of strings, where all the characters in the i-th string are mapped + * to the i-th symbol group + * @return A symbol group lookup table + */ +template +auto make_symbol_group_lut(std::array const& symbol_strings) +{ + return make_symbol_group_lut(symbol_strings, IdentityOp{}); +} + /** * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a new target state. The * class uses shared memory for the lookups. @@ -166,18 +286,20 @@ class TransitionTable { }; public: - using TempStorage = cub::Uninitialized<_TempStorage>; + static constexpr int32_t NUM_STATES = MAX_NUM_STATES; + using TempStorage = cub::Uninitialized<_TempStorage>; struct KernelParameter { + using LookupTableT = TransitionTable; + ItemT transitions[MAX_NUM_STATES * MAX_NUM_SYMBOLS]; }; template - static void InitDeviceTransitionTable( - cudf::detail::hostdevice_vector& transition_table_init, - std::array, MAX_NUM_STATES> const& translation_table, - rmm::cuda_stream_view stream) + static KernelParameter InitDeviceTransitionTable( + std::array, MAX_NUM_STATES> const& translation_table) { + KernelParameter init_data{}; // translation_table[state][symbol] -> new state for (std::size_t state = 0; state < translation_table.size(); ++state) { for (std::size_t symbol = 0; symbol < translation_table[state].size(); ++symbol) { @@ -185,13 +307,12 @@ class TransitionTable { static_cast(translation_table[state][symbol]) <= std::numeric_limits::max(), "Target state index value exceeds value representable by the transition table's type"); - transition_table_init.host_ptr()->transitions[symbol * MAX_NUM_STATES + state] = + init_data.transitions[symbol * MAX_NUM_STATES + state] = static_cast(translation_table[state][symbol]); } } - // Copy transition table to 
device - transition_table_init.host_to_device_async(stream); + return init_data; } constexpr CUDF_HOST_DEVICE TransitionTable(KernelParameter const& kernel_param, @@ -235,24 +356,83 @@ class TransitionTable { } }; +/** + * @brief Creates a transition table of type `TransitionTable` that maps `(state_id, match_id)` + * pairs to the new target state for the given `(state_id, match_id)`-combination. + * + * @tparam StateIdT An integral type used to represent state indexes + * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support + * @param transition_table The transition table + * @return A transition table of type `TransitionTable` + */ +template +auto make_transition_table( + std::array, MAX_NUM_STATES> const& transition_table) +{ + using transition_table_t = TransitionTable; + return transition_table_t::InitDeviceTransitionTable(transition_table); +} + +/** + * @brief Compile-time reflection to check if `OpT` type has the `TempStorage` and + * `KernelParameter` type members. + */ +template +struct is_complex_op : std::false_type {}; + +template +struct is_complex_op> + : std::true_type {}; + +/** + * @brief The device view that is passed to the finite-state transducer algorithm. Each of the + * lookup tables can either be a simple function object that defines the `operator()` required for + * respective lookup table or a complex class. 
+ * + * @tparam SymbolGroupIdLookupT + * @tparam TransitionTableT + * @tparam TranslationTableT + * @tparam NUM_STATES + */ template class dfa_device_view { private: - using sgid_lut_init_t = typename SymbolGroupIdLookupT::KernelParameter; - using transition_table_init_t = typename TransitionTableT::KernelParameter; - using translation_table_init_t = typename TranslationTableT::KernelParameter; + // Complex symbol group lookup operators need to declare a `TempStorage` and `KernelParameter` + // type member that is passed during device-side initialization. + using sgid_lut_init_t = std::conditional_t::value, + typename SymbolGroupIdLookupT::KernelParameter, + SymbolGroupIdLookupT>; + + // Complex transition table lookup operators need to declare a `TempStorage` and + // `KernelParameter` type member that is passed during device-side initialization. + using transition_table_init_t = std::conditional_t::value, + typename TransitionTableT::KernelParameter, + TransitionTableT>; + + // Complex translation table lookup operators need to declare a `TempStorage` and + // `KernelParameter` type member that is passed during device-side initialization. 
+ using translation_table_init_t = std::conditional_t::value, + typename TranslationTableT::KernelParameter, + TranslationTableT>; public: // The maximum number of states supported by this DFA instance // This is a value queried by the DFA simulation algorithm static constexpr int32_t MAX_NUM_STATES = NUM_STATES; - using SymbolGroupStorageT = typename SymbolGroupIdLookupT::TempStorage; - using TransitionTableStorageT = typename TransitionTableT::TempStorage; - using TranslationTableStorageT = typename TranslationTableT::TempStorage; + using SymbolGroupStorageT = std::conditional_t::value, + typename SymbolGroupIdLookupT::TempStorage, + typename cub::NullType>; + using TransitionTableStorageT = std::conditional_t::value, + typename TransitionTableT::TempStorage, + typename cub::NullType>; + using TranslationTableStorageT = std::conditional_t::value, + typename TranslationTableT::TempStorage, + typename cub::NullType>; __device__ auto InitSymbolGroupLUT(SymbolGroupStorageT& temp_storage) { @@ -286,14 +466,16 @@ class dfa_device_view { /** * @brief Lookup table mapping (old_state, symbol_group_id) transitions to a sequence of symbols - * that the finite-state transducer is supposed to output for each transition. The class uses shared - * memory for the lookups. + * that the finite-state transducer is supposed to output for each transition. The class uses + * shared memory for the lookups. * * @tparam OutSymbolT The symbol type being output - * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output symbols + * @tparam OutSymbolOffsetT Type sufficiently large to index into the lookup table of output + * symbols * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols + * be used. 
*/ template ; struct KernelParameter { + using LookupTableT = TransducerLookupTable; + OutSymbolOffsetT d_out_offsets[MAX_NUM_STATES * MAX_NUM_SYMBOLS + 1]; OutSymbolT d_out_symbols[MAX_TABLE_SIZE]; }; @@ -321,12 +509,11 @@ class TransducerLookupTable { * @note Synchronizes the thread block, if called from device, and, hence, requires all threads * of the thread block to call the constructor */ - static void InitDeviceTranslationTable( - cudf::detail::hostdevice_vector& translation_table_init, + static KernelParameter InitDeviceTranslationTable( std::array, MAX_NUM_SYMBOLS>, MAX_NUM_STATES> const& - translation_table, - rmm::cuda_stream_view stream) + translation_table) { + KernelParameter init_data; std::vector out_symbols; out_symbols.reserve(MAX_TABLE_SIZE); std::vector out_symbol_offsets; @@ -357,15 +544,11 @@ class TransducerLookupTable { CUDF_EXPECTS(out_symbols.size() <= MAX_TABLE_SIZE, "Unsupported translation table"); // Prepare host-side data to be copied and passed to the device - std::copy(std::cbegin(out_symbol_offsets), - std::cend(out_symbol_offsets), - translation_table_init.host_ptr()->d_out_offsets); - std::copy(std::cbegin(out_symbols), - std::cend(out_symbols), - translation_table_init.host_ptr()->d_out_symbols); - - // Copy data to device - translation_table_init.host_to_device_async(stream); + std::copy( + std::cbegin(out_symbol_offsets), std::cend(out_symbol_offsets), init_data.d_out_offsets); + std::copy(std::cbegin(out_symbols), std::cend(out_symbols), init_data.d_out_symbols); + + return init_data; } private: @@ -408,24 +591,130 @@ class TransducerLookupTable { #endif } - template - constexpr CUDF_HOST_DEVICE OutSymbolT operator()(StateIndexT const state_id, - SymbolIndexT const match_id, - RelativeOffsetT const relative_offset) const + template + constexpr CUDF_HOST_DEVICE auto operator()(StateIndexT const state_id, + SymbolIndexT const match_id, + RelativeOffsetT const relative_offset, + SymbolT const /*read_symbol*/) const { auto offset 
= temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id] + relative_offset; return temp_storage.out_symbols[offset]; } - template + template constexpr CUDF_HOST_DEVICE OutSymbolOffsetT operator()(StateIndexT const state_id, - SymbolIndexT const match_id) const + SymbolIndexT const match_id, + SymbolT const /*read_symbol*/) const { return temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id + 1] - temp_storage.out_offset[state_id * MAX_NUM_SYMBOLS + match_id]; } }; +/** + * @brief Creates a translation table that maps (old_state, symbol_group_id) transitions to a + * sequence of symbols that the finite-state transducer is supposed to output for each transition. + * + * @tparam MAX_TABLE_SIZE The maximum number of items in the lookup table of output symbols + * be used + * @tparam OutSymbolT The symbol type being output + * @tparam MAX_NUM_SYMBOLS The maximum number of symbols being output by a single state transition + * @tparam MAX_NUM_STATES The maximum number of states that this lookup table shall support + * @param translation_table The translation table + * @return A translation table of type `TransducerLookupTable`. + */ +template +auto make_translation_table(std::array, MAX_NUM_SYMBOLS>, + MAX_NUM_STATES> const& translation_table) +{ + using OutSymbolOffsetT = int32_t; + using translation_table_t = TransducerLookupTable; + return translation_table_t::InitDeviceTranslationTable(translation_table); +} + +template +class TranslationOp { + private: + struct _TempStorage {}; + + public: + using TempStorage = cub::Uninitialized<_TempStorage>; + + struct KernelParameter { + using LookupTableT = TranslationOp; + TranslationOpT translation_op; + }; + + /** + * @brief Initializes the lookup table, primarily to be invoked from within device code but also + * provides host-side implementation for verification. 
+ * @note Synchronizes the thread block, if called from device, and, hence, requires all threads + * of the thread block to call the constructor + */ + static KernelParameter InitDeviceTranslationTable(TranslationOpT translation_op) + { + return KernelParameter{translation_op}; + } + + private: + _TempStorage& temp_storage; + TranslationOpT translation_op; + + __device__ __forceinline__ _TempStorage& PrivateStorage() + { + __shared__ _TempStorage private_storage; + return private_storage; + } + + public: + CUDF_HOST_DEVICE TranslationOp(KernelParameter const& kernel_param, TempStorage& temp_storage) + : temp_storage(temp_storage.Alias()), translation_op(kernel_param.translation_op) + { + } + + template + constexpr CUDF_HOST_DEVICE auto operator()(StateIndexT const state_id, + SymbolIndexT const match_id, + RelativeOffsetT const relative_offset, + SymbolT const read_symbol) const + { + return translation_op(*this, state_id, match_id, relative_offset, read_symbol); + } + + template + constexpr CUDF_HOST_DEVICE auto operator()(StateIndexT const state_id, + SymbolIndexT const match_id, + SymbolT const read_symbol) const + { + return translation_op(*this, state_id, match_id, read_symbol); + } +}; + +/** + * @brief Creates a simple translation table that uses a simple function object to retrieve the + * + * @tparam FunctorT A function object type that must implement two signatures: (1) with `(state_id, + * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)` + * @param map_op A function object that must implement two signatures: (1) with `(state_id, + * match_id, read_symbol)` and (2) with `(state_id, match_id, relative_offset, read_symbol)`. + * Invocations of the first signature, (1), must return the number of symbols that are emitted for + * the given transition. 
The second signature, (2), must return the i-th symbol to be emitted for + * that transition, where `i` corresponds to `relative_offset` + * @return A translation table of type `TranslationOp` + */ +template +auto make_translation_functor(FunctorT map_op) +{ + return TranslationOp::InitDeviceTranslationTable(map_op); +} + /** * @brief Helper class to facilitate the specification and instantiation of a DFA (i.e., the * transition table and its number of states, the mapping of symbols to symbol groups, and the @@ -437,70 +726,32 @@ class TransducerLookupTable { * @tparam NUM_STATES The number of states defined by the DFA (the other dimension of the * transition table) */ -template +template class Dfa { - public: - // The maximum number of states supported by this DFA instance - // This is a value queried by the DFA simulation algorithm - static constexpr int32_t MAX_NUM_STATES = NUM_STATES; - - private: - // Symbol-group id lookup table - using SymbolGroupIdLookupT = detail::SingleSymbolSmemLUT; - using SymbolGroupIdInitT = typename SymbolGroupIdLookupT::KernelParameter; - - // Transition table - using TransitionTableT = detail::TransitionTable; - using TransitionTableInitT = typename TransitionTableT::KernelParameter; - - // Translation lookup table - using OutSymbolOffsetT = uint32_t; - using TranslationTableT = detail::TransducerLookupTable; - using TranslationTableInitT = typename TranslationTableT::KernelParameter; + static constexpr int32_t single_item = 1; + public: auto get_device_view() { - return dfa_device_view{ - sgid_init.d_begin(), transition_table_init.d_begin(), translation_table_init.d_begin()}; + return dfa_device_view{ + &init_data.d_begin()->sgid_lut_init, + &init_data.d_begin()->transition_table_init, + &init_data.d_begin()->translation_table_init}; } - public: - /** - * @brief Constructs a new DFA. - * - * @param symbol_vec Sequence container of symbol groups. Each symbol group is a sequence - * container to symbols within that group.
The index of the symbol group containing a symbol being - * read will be used as symbol_gid of the transition and translation tables. - * @param tt_vec The transition table - * @param out_tt_vec The translation table - * @param stream The stream to which memory operations and kernels are getting dispatched to - */ - template - Dfa(SymbolGroupIdItT const& symbol_vec, - std::array, NUM_STATES> const& tt_vec, - std::array, NUM_SYMBOLS>, NUM_STATES> const& out_tt_vec, - cudaStream_t stream) + Dfa(SymbolGroupIdInitT const& sgid_lut_init, + TransitionTableInitT const& transition_table_init, + TranslationTableInitT const& translation_table_init, + rmm::cuda_stream_view stream) + : init_data{single_item, stream} { - constexpr std::size_t single_item = 1; - - sgid_init = cudf::detail::hostdevice_vector{single_item, stream}; - transition_table_init = - cudf::detail::hostdevice_vector{single_item, stream}; - translation_table_init = - cudf::detail::hostdevice_vector{single_item, stream}; - - // Initialize symbol group id lookup table - SymbolGroupIdLookupT::InitDeviceSymbolGroupIdLut(sgid_init, symbol_vec, stream); - - // Initialize state transition table - TransitionTableT::InitDeviceTransitionTable(transition_table_init, tt_vec, stream); - - // Initialize finite-state transducer lookup table - TranslationTableT::InitDeviceTranslationTable(translation_table_init, out_tt_vec, stream); + *init_data.host_ptr() = {sgid_lut_init, transition_table_init, translation_table_init}; + init_data.host_to_device_async(stream); } /** @@ -513,8 +764,8 @@ class Dfa { * indexes are written. 
* @tparam TransducedCountOutItT A single-item output iterator type to which the total number of * output symbols is written - * @tparam OffsetT A type large enough to index into either of both: (a) the input symbols and (b) - * the output symbols + * @tparam OffsetT A type large enough to index into either of both: (a) the input symbols and + * (b) the output symbols * @param d_chars Pointer to the input string of symbols * @param num_chars The total number of input symbols to process * @param d_out_it Random-access output iterator to which the transduced output is @@ -527,12 +778,12 @@ class Dfa { * "end-state" of the previous invocation of the algorithm. * @param stream CUDA stream to launch kernels within. Default is the null-stream. */ - template - void Transduce(SymbolT const* d_chars, + void Transduce(SymbolItT d_chars_it, OffsetT num_chars, TransducedOutItT d_out_it, TransducedIndexOutItT d_out_idx_it, @@ -545,7 +796,7 @@ class Dfa { DeviceTransduce(nullptr, temp_storage_bytes, this->get_device_view(), - d_chars, + d_chars_it, num_chars, d_out_it, d_out_idx_it, @@ -560,7 +811,7 @@ class Dfa { DeviceTransduce(temp_storage.data(), temp_storage_bytes, this->get_device_view(), - d_chars, + d_chars_it, num_chars, d_out_it, d_out_idx_it, @@ -570,9 +821,36 @@ class Dfa { } private: - cudf::detail::hostdevice_vector sgid_init{}; - cudf::detail::hostdevice_vector transition_table_init{}; - cudf::detail::hostdevice_vector translation_table_init{}; + struct host_device_data { + SymbolGroupIdInitT sgid_lut_init; + TransitionTableInitT transition_table_init; + TranslationTableInitT translation_table_init; + }; + cudf::detail::hostdevice_vector init_data{}; }; +/** + * @brief Creates a deterministic finite automaton (DFA) as specified by the triple of (symbol + * group, transition, translation)-lookup tables to be used with the finite-state transducer + * algorithm.
+ * + * @param sgid_lut_init Object used to initialize the symbol group lookup table + * @param transition_table_init Object used to initialize the transition table + * @param translation_table_init Object used to initialize the translation table + * @param stream The stream used to allocate and initialize device-side memory that is used to + * initialize the lookup tables + * @return A DFA of type `Dfa`. + */ +template +auto make_fst(SymbolGroupIdInitT const& sgid_lut_init, + TransitionTableInitT const& transition_table_init, + TranslationTableInitT const& translation_table_init, + rmm::cuda_stream_view stream) +{ + return Dfa( + sgid_lut_init, transition_table_init, translation_table_init, stream); +} + } // namespace cudf::io::fst::detail diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 1c7d5b11032..3bbfc4b5f83 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -44,6 +44,21 @@ struct tree_meta_t { */ enum class json_col_t : char { ListColumn, StructColumn, StringColumn, Unknown }; +/** + * @brief Enum class to specify whether we just push onto and pop from the stack or whether we also + * reset to an empty stack on a newline character. + */ +enum class stack_behavior_t : char { + /// Opening brackets and braces, [, {, push onto the stack, closing brackets and braces, ], }, pop + /// from the stack + PushPopWithoutReset, + + /// Opening brackets and braces, [, {, push onto the stack, closing brackets and braces, ], }, pop + /// from the stack. Newline characters are considered delimiters and therefore reset to an empty + /// stack. 
+ ResetOnDelimiter +}; + // Default name for a list's child column constexpr auto list_child_name{"element"}; @@ -175,12 +190,28 @@ namespace detail { * character of \p d_json_in, where a '{' represents that the corresponding input character is * within the context of a struct, a '[' represents that it is within the context of an array, and a * '_' symbol that it is at the root of the JSON. + * @param[in] stack_behavior Specifies the stack's behavior * @param[in] stream The cuda stream to dispatch GPU kernels to */ void get_stack_context(device_span json_in, SymbolT* d_top_of_stack, + stack_behavior_t stack_behavior, rmm::cuda_stream_view stream); +/** + * @brief Post-processes a token stream that may contain tokens from invalid lines. Expects that the + * token stream begins with a LineEnd token. + * + * @param tokens The tokens to be post-processed + * @param token_indices The tokens' corresponding indices that are post-processed + * @param stream The cuda stream to dispatch GPU kernels to + * @return Returns the post-processed token stream + */ +std::pair, rmm::device_uvector> process_token_stream( + device_span tokens, + device_span token_indices, + rmm::cuda_stream_view stream); + /** * @brief Parses the given JSON string and generates a tree representation of the given input. 
* diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 26dffd3328a..3b6c2b18250 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -39,8 +40,11 @@ #include #include +#include #include +#include #include +#include #include #include @@ -88,6 +92,115 @@ void check_input_size(std::size_t input_size) namespace cudf::io::json { +// FST to prune tokens of invalid lines for recovering JSON lines format +namespace token_filter { + +// Type used to represent the target state in the transition table +using StateT = char; + +// Type used to represent a symbol group id +using SymbolGroupT = uint8_t; + +/** + * @brief Definition of the DFA's states + */ +enum class dfa_states : StateT { VALID, INVALID, NUM_STATES }; + +// Aliases for readability of the transition table +constexpr auto TT_INV = dfa_states::INVALID; +constexpr auto TT_VLD = dfa_states::VALID; + +/** + * @brief Definition of the symbol groups + */ +enum class dfa_symbol_group_id : SymbolGroupT { + ERROR, ///< Error token symbol group + DELIMITER, ///< Record / line delimiter symbol group + OTHER_SYMBOLS, ///< Symbol group that implicitly matches all other tokens + NUM_SYMBOL_GROUPS ///< Total number of symbol groups +}; + +constexpr auto TT_NUM_STATES = static_cast(dfa_states::NUM_STATES); +constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NUM_SYMBOL_GROUPS); + +// Lookup table to map an input symbol (i.e., a token) to a symbol group +std::array, NUM_SYMBOL_GROUPS - 1> const symbol_groups{{ + {static_cast(token_t::ErrorBegin)}, // Symbols mapping to ERROR + {static_cast(token_t::LineEnd)} // Symbols mapping to DELIMITER +}}; + +/** + * @brief Function object to map (token,token_index) tuples to a symbol group. 
+ */ +struct UnwrapTokenFromSymbolOp { + template + CUDF_HOST_DEVICE SymbolGroupT operator()(SymbolGroupLookupTableT const& sgid_lut, + thrust::tuple symbol) const + { + PdaTokenT const token_type = thrust::get<0>(symbol); + return sgid_lut.lookup(token_type); + } +}; + +/** + * @brief Translation function object that discards line delimiter tokens and tokens belonging to + * invalid lines. + */ +struct TransduceToken { + template + constexpr CUDF_HOST_DEVICE SymbolT operator()(TransducerTableT const&, + StateT const state_id, + SymbolGroupT const match_id, + RelativeOffsetT const relative_offset, + SymbolT const read_symbol) const + { + const bool is_end_of_invalid_line = + (state_id == static_cast(TT_INV) && + match_id == static_cast(dfa_symbol_group_id::DELIMITER)); + + if (is_end_of_invalid_line) { + return relative_offset == 0 ? SymbolT{token_t::StructEnd, 0} + : SymbolT{token_t::StructBegin, 0}; + } else { + return read_symbol; + } + } + + template + constexpr CUDF_HOST_DEVICE int32_t operator()(TransducerTableT const&, + StateT const state_id, + SymbolGroupT const match_id, + SymbolT const read_symbol) const + { + // Number of tokens emitted on invalid lines + constexpr int32_t num_inv_tokens = 2; + + const bool is_delimiter = match_id == static_cast(dfa_symbol_group_id::DELIMITER); + + // If state is either invalid or we're entering an invalid state, we discard tokens + const bool is_part_of_invalid_line = + (match_id != static_cast(dfa_symbol_group_id::ERROR) && + state_id == static_cast(TT_VLD)); + + // Indicates whether we transition from an invalid line to a potentially valid line + const bool is_end_of_invalid_line = (state_id == static_cast(TT_INV) && is_delimiter); + + int32_t const emit_count = + is_end_of_invalid_line ? num_inv_tokens : (is_part_of_invalid_line && !is_delimiter ? 
1 : 0); + return emit_count; + } +}; + +// Transition table +std::array, TT_NUM_STATES> const transition_table{ + {/* IN_STATE ERROR DELIM OTHER */ + /* VALID */ {{TT_INV, TT_VLD, TT_VLD}}, + /* INVALID */ {{TT_INV, TT_VLD, TT_INV}}}}; + +// The DFA's starting state +constexpr auto start_state = static_cast(TT_VLD); +} // namespace token_filter + // JSON to stack operator DFA (Deterministic Finite Automata) namespace to_stack_op { @@ -129,6 +242,7 @@ enum class dfa_symbol_group_id : uint8_t { CLOSING_BRACKET, ///< Closing bracket SG: ] QUOTE_CHAR, ///< Quote character SG: " ESCAPE_CHAR, ///< Escape character SG: '\' + NEWLINE_CHAR, ///< Newline character SG: '\n' OTHER_SYMBOLS, ///< SG implicitly matching all other characters NUM_SYMBOL_GROUPS ///< Total number of symbol groups }; @@ -138,21 +252,29 @@ constexpr auto NUM_SYMBOL_GROUPS = static_cast(dfa_symbol_group_id::NU // The i-th string representing all the characters of a symbol group std::array const symbol_groups{ - {{"{"}, {"["}, {"}"}, {"]"}, {"\""}, {"\\"}}}; + {{"{"}, {"["}, {"}"}, {"]"}, {"\""}, {"\\"}, {"\n"}}}; // Transition table std::array, TT_NUM_STATES> const transition_table{ - {/* IN_STATE { [ } ] " \ OTHER */ - /* TT_OOS */ {{TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS}}, - /* TT_STR */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_STR}}, - /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}}; + {/* IN_STATE { [ } ] " \ \n OTHER */ + /* TT_OOS */ {{TT_OOS, TT_OOS, TT_OOS, TT_OOS, TT_STR, TT_OOS, TT_OOS, TT_OOS}}, + /* TT_STR */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_OOS, TT_ESC, TT_STR, TT_STR}}, + /* TT_ESC */ {{TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR, TT_STR}}}}; // Translation table (i.e., for each transition, what are the symbols that we output) std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const translation_table{ - {/* IN_STATE { [ } ] " \ OTHER */ - /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {}}}, - /* TT_STR */ {{{}, {}, {}, 
{}, {}, {}, {}}}, - /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {}}}}}; + {/* IN_STATE { [ } ] " \ \n OTHER */ + /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {}, {}}}, + /* TT_STR */ {{{}, {}, {}, {}, {}, {}, {}, {}}}, + /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {}, {}}}}}; + +// Translation table +std::array, NUM_SYMBOL_GROUPS>, TT_NUM_STATES> const + resetting_translation_table{ + {/* IN_STATE { [ } ] " \ \n OTHER */ + /* TT_OOS */ {{{'{'}, {'['}, {'}'}, {']'}, {}, {}, {'\n'}, {}}}, + /* TT_STR */ {{{}, {}, {}, {}, {}, {}, {}, {}}}, + /* TT_ESC */ {{{}, {}, {}, {}, {}, {}, {}, {}}}}}; // The DFA's starting state constexpr auto start_state = static_cast(TT_OOS); @@ -409,6 +531,27 @@ enum class pda_state_t : StateT { PD_NUM_STATES }; +enum class json_format_cfg_t { + // Format describing regular JSON + JSON, + + // Format describing permissive newline-delimited JSON + // I.e., newline characters are only treated as delimiters at the root stack level + // E.g., this is treated as a single record: + // {"a": + // 123} + JSON_LINES, + + // Format describing strict newline-delimited JSON + // I.e., All newlines are delimiting a record, independent of the context they appear in + JSON_LINES_STRICT, + + // Transition table for parsing newline-delimited JSON that recovers from invalid JSON lines + // This format also follows `JSON_LINES_STRICT` behaviour + JSON_LINES_RECOVER + +}; + // Aliases for readability of the transition table constexpr auto PD_BOV = pda_state_t::PD_BOV; constexpr auto PD_BOA = pda_state_t::PD_BOA; @@ -430,68 +573,133 @@ constexpr auto start_state = static_cast(pda_state_t::PD_BOV); /** * @brief Getting the transition table */ -auto get_transition_table(bool newline_delimited_json) +auto get_transition_table(json_format_cfg_t format) { static_assert(static_cast(stack_symbol_group_id::STACK_ROOT) == 0); static_assert(static_cast(stack_symbol_group_id::STACK_LIST) == 1); static_assert(static_cast(stack_symbol_group_id::STACK_STRUCT) == 2); - // In
case of newline-delimited JSON, multiple newlines are ignored, similar to whitespace. - // Thas is, empty lines are ignored - auto const PD_ANL = newline_delimited_json ? PD_BOV : PD_PVL; std::array, PD_NUM_STATES> pda_tt; - // { [ } ] " \ , : space newline other - pda_tt[static_cast(pda_state_t::PD_BOV)] = { - PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, - PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, - PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON}; - pda_tt[static_cast(pda_state_t::PD_BOA)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOA, PD_LON, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOA, PD_ERR}; - pda_tt[static_cast(pda_state_t::PD_LON)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_PVL, PD_LON, - PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_PVL, PD_LON, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_PVL, PD_LON}; - pda_tt[static_cast(pda_state_t::PD_STR)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; - pda_tt[static_cast(pda_state_t::PD_SCE)] = { - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, - PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; - pda_tt[static_cast(pda_state_t::PD_PVL)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ANL, PD_ERR, - 
PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_PVL, PD_ERR, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_PVL, PD_ERR}; - pda_tt[static_cast(pda_state_t::PD_BFN)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_BFN, PD_ERR}; - pda_tt[static_cast(pda_state_t::PD_FLN)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; - pda_tt[static_cast(pda_state_t::PD_FNE)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; - pda_tt[static_cast(pda_state_t::PD_PFN)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_PFN, PD_PFN, PD_ERR}; - pda_tt[static_cast(pda_state_t::PD_ERR)] = { - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, - PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR}; + + if (format == json_format_cfg_t::JSON || format == json_format_cfg_t::JSON_LINES) { + // In case of newline-delimited JSON, multiple newlines are ignored, similar to whitespace. 
+ // Thas is, empty lines are ignored + // PD_ANL describes the target state after a new line on an empty stack (JSON root level) + auto const PD_ANL = (format == json_format_cfg_t::JSON) ? PD_PVL : PD_BOV; + + // First row: empty stack ("root" level of the JSON) + // Second row: '[' on top of stack (we're parsing a list value) + // Third row: '{' on top of stack (we're parsing a struct value) + // { [ } ] " \ , : space newline other + pda_tt[static_cast(pda_state_t::PD_BOV)] = { + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON}; + pda_tt[static_cast(pda_state_t::PD_BOA)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOA, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOA, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_LON)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_PVL, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_PVL, PD_LON}; + pda_tt[static_cast(pda_state_t::PD_STR)] = { + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + pda_tt[static_cast(pda_state_t::PD_SCE)] = { + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, 
PD_STR, PD_STR, PD_STR, PD_STR}; + pda_tt[static_cast(pda_state_t::PD_PVL)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ANL, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_PVL, PD_ERR, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_PVL, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_BFN)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_BFN, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_FLN)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + pda_tt[static_cast(pda_state_t::PD_FNE)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + pda_tt[static_cast(pda_state_t::PD_PFN)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_PFN, PD_PFN, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_ERR)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR}; + } + // Transition table for strict JSON lines (including 
recovery) + // Newlines are treated as record delimiters + else { + // In case of newline-delimited JSON, multiple newlines are ignored, similar to whitespace. + // Thas is, empty lines are ignored + // PD_ANL describes the target state after a new line after encountering error state + auto const PD_ANL = (format == json_format_cfg_t::JSON_LINES_RECOVER) ? PD_BOV : PD_ERR; + + // First row: empty stack ("root" level of the JSON) + // Second row: '[' on top of stack (we're parsing a list value) + // Third row: '{' on top of stack (we're parsing a struct value) + // { [ } ] " \ , : space newline other + pda_tt[static_cast(pda_state_t::PD_BOV)] = { + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON, + PD_BOA, PD_BOA, PD_ERR, PD_ERR, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_BOV, PD_LON}; + pda_tt[static_cast(pda_state_t::PD_BOA)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_BOA, PD_BOA, PD_ERR, PD_PVL, PD_STR, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOV, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BOA, PD_BOV, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_LON)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_BOV, PD_LON, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_BOV, PD_LON, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_BOV, PD_LON}; + pda_tt[static_cast(pda_state_t::PD_STR)] = { + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_PVL, PD_SCE, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + pda_tt[static_cast(pda_state_t::PD_SCE)] = { + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, 
PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, + PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR, PD_STR}; + pda_tt[static_cast(pda_state_t::PD_PVL)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_BOV, PD_ERR, PD_PVL, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_ERR, PD_PVL, PD_BOV, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_BFN)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_PVL, PD_ERR, PD_FLN, PD_ERR, PD_ERR, PD_ERR, PD_BFN, PD_BOV, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_FLN)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_PFN, PD_FNE, PD_FLN, PD_FLN, PD_FLN, PD_BOV, PD_FLN}; + pda_tt[static_cast(pda_state_t::PD_FNE)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, + PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN, PD_FLN}; + pda_tt[static_cast(pda_state_t::PD_PFN)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_BOV, PD_PFN, PD_BOV, PD_ERR}; + pda_tt[static_cast(pda_state_t::PD_ERR)] = { + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ANL, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, 
PD_ANL, PD_ERR, + PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ERR, PD_ANL, PD_ERR}; + } return pda_tt; } /** * @brief Getting the translation table */ -auto get_translation_table() +auto get_translation_table(bool include_line_delimiter) { constexpr auto StructBegin = token_t::StructBegin; constexpr auto StructEnd = token_t::StructEnd; @@ -507,6 +715,15 @@ auto get_translation_table() constexpr auto ValueEnd = token_t::ValueEnd; constexpr auto ErrorBegin = token_t::ErrorBegin; + /** + * @brief Appends token_t::LineEnd token to the given token sequence, if and only if + * `include_line_delimiter` is true. + */ + auto nl_tokens = [include_line_delimiter](std::vector tokens) { + if (include_line_delimiter) { tokens.push_back(token_t::LineEnd); } + return tokens; + }; + std::array, NUM_PDA_SGIDS>, PD_NUM_STATES> pda_tlt; pda_tlt[static_cast(pda_state_t::PD_BOV)] = {{ /*ROOT*/ {StructBegin}, // OPENING_BRACE @@ -518,7 +735,7 @@ auto get_translation_table() {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - {}, // LINE_BREAK + nl_tokens({}), // LINE_BREAK {ValueBegin}, // OTHER /*LIST*/ {StructBegin}, // OPENING_BRACE @@ -530,7 +747,7 @@ auto get_translation_table() {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - {}, // LINE_BREAK + nl_tokens({}), // LINE_BREAK {ValueBegin}, // OTHER /*STRUCT*/ {StructBegin}, // OPENING_BRACE @@ -542,7 +759,7 @@ auto get_translation_table() {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - {}, // LINE_BREAK + nl_tokens({}), // LINE_BREAK {ValueBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_BOA)] = { { /*ROOT*/ @@ -555,7 +772,7 @@ auto get_translation_table() {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK + nl_tokens({ErrorBegin}), // LINE_BREAK {ErrorBegin}, // OTHER /*LIST*/ {StructBegin}, // OPENING_BRACE @@ -567,7 +784,7 @@ auto get_translation_table() {ErrorBegin}, // COMMA {ErrorBegin}, // 
COLON {}, // WHITE_SPACE - {}, // LINE_BREAK + nl_tokens({}), // LINE_BREAK {ValueBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE @@ -579,7 +796,7 @@ auto get_translation_table() {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - {}, // LINE_BREAK + nl_tokens({}), // LINE_BREAK {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_LON)] = { { /*ROOT*/ @@ -592,132 +809,132 @@ auto get_translation_table() {ErrorBegin}, // COMMA {ErrorBegin}, // COLON {ValueEnd}, // WHITE_SPACE - {ValueEnd}, // LINE_BREAK + nl_tokens({ValueEnd}), // LINE_BREAK {}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ValueEnd, ListEnd}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ValueEnd}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - {ValueEnd}, // LINE_BREAK - {}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ValueEnd, ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ValueEnd}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + nl_tokens({ValueEnd}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ValueEnd, StructMemberEnd, StructEnd}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ValueEnd, StructMemberEnd}, // COMMA - {ErrorBegin}, // COLON - {ValueEnd}, // WHITE_SPACE - {ValueEnd}, // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ValueEnd, StructMemberEnd, StructEnd}, // 
CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ValueEnd, StructMemberEnd}, // COMMA + {ErrorBegin}, // COLON + {ValueEnd}, // WHITE_SPACE + nl_tokens({ValueEnd}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_STR)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {StringEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{ /*ROOT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {StringEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_SCE)] = {{ /*ROOT*/ + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + 
nl_tokens({}), // LINE_BREAK + {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_PVL)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ListEnd}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + 
{ErrorBegin}, // CLOSING_BRACE + {ListEnd}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -728,145 +945,145 @@ auto get_translation_table() {StructMemberEnd}, // COMMA {ErrorBegin}, // COLON {}, // WHITE_SPACE - {}, // LINE_BREAK + nl_tokens({}), // LINE_BREAK {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_BFN)] = { - { /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER + { /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {StructEnd}, // 
CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {StructMemberBegin, FieldNameBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {ErrorBegin}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {StructEnd}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {StructMemberBegin, FieldNameBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {ErrorBegin}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_FLN)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + 
{ErrorBegin}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {FieldNameEnd}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {FieldNameEnd}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_FNE)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // 
OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}}}; // OTHER - - pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{ /*ROOT*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}}}; // OTHER + + pda_tlt[static_cast(pda_state_t::PD_PFN)] = {{ /*ROOT*/ + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*LIST*/ - {ErrorBegin}, // OPENING_BRACE - {ErrorBegin}, // OPENING_BRACKET - {ErrorBegin}, // CLOSING_BRACE - {ErrorBegin}, // CLOSING_BRACKET - {ErrorBegin}, // QUOTE - {ErrorBegin}, // ESCAPE - {ErrorBegin}, // COMMA - {ErrorBegin}, // COLON - {ErrorBegin}, // WHITE_SPACE - {ErrorBegin}, // LINE_BREAK - {ErrorBegin}, // OTHER + {ErrorBegin}, // OPENING_BRACE + {ErrorBegin}, // OPENING_BRACKET + {ErrorBegin}, // CLOSING_BRACE + {ErrorBegin}, // CLOSING_BRACKET + {ErrorBegin}, // QUOTE + {ErrorBegin}, // ESCAPE + {ErrorBegin}, // COMMA + {ErrorBegin}, // COLON + {ErrorBegin}, // WHITE_SPACE + nl_tokens({ErrorBegin}), // LINE_BREAK + {ErrorBegin}, // OTHER /*STRUCT*/ {ErrorBegin}, // OPENING_BRACE {ErrorBegin}, // OPENING_BRACKET @@ -877,7 +1094,7 @@ auto get_translation_table() {ErrorBegin}, // COMMA 
{}, // COLON {}, // WHITE_SPACE - {}, // LINE_BREAK + nl_tokens({}), // LINE_BREAK {ErrorBegin}}}; // OTHER pda_tlt[static_cast(pda_state_t::PD_ERR)] = {{ /*ROOT*/ @@ -890,32 +1107,32 @@ auto get_translation_table() {}, // COMMA {}, // COLON {}, // WHITE_SPACE - {}, // LINE_BREAK + nl_tokens({}), // LINE_BREAK {}, // OTHER /*LIST*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}, // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}, // OTHER /*STRUCT*/ - {}, // OPENING_BRACE - {}, // OPENING_BRACKET - {}, // CLOSING_BRACE - {}, // CLOSING_BRACKET - {}, // QUOTE - {}, // ESCAPE - {}, // COMMA - {}, // COLON - {}, // WHITE_SPACE - {}, // LINE_BREAK - {}}}; // OTHER + {}, // OPENING_BRACE + {}, // OPENING_BRACKET + {}, // CLOSING_BRACE + {}, // CLOSING_BRACKET + {}, // QUOTE + {}, // ESCAPE + {}, // COMMA + {}, // COLON + {}, // WHITE_SPACE + nl_tokens({}), // LINE_BREAK + {}}}; // OTHER return pda_tlt; } @@ -929,9 +1146,32 @@ struct JSONToStackOp { template constexpr CUDF_HOST_DEVICE fst::stack_op_type operator()(StackSymbolT const& stack_symbol) const { - return (stack_symbol == '{' || stack_symbol == '[') ? fst::stack_op_type::PUSH - : (stack_symbol == '}' || stack_symbol == ']') ? 
fst::stack_op_type::POP - : fst::stack_op_type::READ; + switch (stack_symbol) { + case '{': + case '[': return fst::stack_op_type::PUSH; + case '}': + case ']': return fst::stack_op_type::POP; + default: return fst::stack_op_type::READ; + } + } +}; + +/** + * @brief Function object used to filter for brackets and braces that represent push and pop + * operations, and for newlines that represent a reset of the stack + */ +struct JSONWithRecoveryToStackOp { + template + constexpr CUDF_HOST_DEVICE fst::stack_op_type operator()(StackSymbolT const& stack_symbol) const + { + switch (stack_symbol) { + case '{': + case '[': return fst::stack_op_type::PUSH; + case '}': + case ']': return fst::stack_op_type::POP; + case '\n': return fst::stack_op_type::RESET; + default: return fst::stack_op_type::READ; + } } }; @@ -1030,6 +1270,7 @@ namespace detail { void get_stack_context(device_span json_in, SymbolT* d_top_of_stack, + stack_behavior_t stack_behavior, rmm::cuda_stream_view stream) { check_input_size(json_in.size()); @@ -1052,15 +1293,19 @@ void get_stack_context(device_span json_in, rmm::device_uvector stack_op_indices{json_in.size(), stream}; // Prepare finite-state transducer that only selects '{', '}', '[', ']' outside of quotes - using ToStackOpFstT = - cudf::io::fst::detail::Dfa( - to_stack_op::dfa_symbol_group_id::NUM_SYMBOL_GROUPS), - static_cast(to_stack_op::dfa_states::TT_NUM_STATES)>; - ToStackOpFstT json_to_stack_ops_fst{to_stack_op::symbol_groups, - to_stack_op::transition_table, - to_stack_op::translation_table, - stream}; + constexpr auto max_translation_table_size = + to_stack_op::NUM_SYMBOL_GROUPS * to_stack_op::TT_NUM_STATES; + + // Translation table specialized on the choice of whether to reset on newlines outside of strings + const auto translation_table = (stack_behavior == stack_behavior_t::ResetOnDelimiter) + ?
to_stack_op::resetting_translation_table + : to_stack_op::translation_table; + + auto json_to_stack_ops_fst = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(to_stack_op::symbol_groups), + fst::detail::make_transition_table(to_stack_op::transition_table), + fst::detail::make_translation_table(translation_table), + stream); // "Search" for relevant occurrence of brackets and braces that indicate the beginning/end // of structs/lists @@ -1075,16 +1320,80 @@ void get_stack_context(device_span json_in, // Copy back to actual number of stack operations auto const num_stack_ops = d_num_stack_ops.value(stream); - // stack operations with indices are converted to top of the stack for each character in the input - fst::sparse_stack_op_to_top_of_stack( - stack_ops.data(), - device_span{stack_op_indices.data(), num_stack_ops}, - JSONToStackOp{}, - d_top_of_stack, - root_symbol, - read_symbol, - json_in.size(), + // Stack operations with indices are converted to top of the stack for each character in the input + if (stack_behavior == stack_behavior_t::ResetOnDelimiter) { + fst::sparse_stack_op_to_top_of_stack( + stack_ops.data(), + device_span{stack_op_indices.data(), num_stack_ops}, + JSONWithRecoveryToStackOp{}, + d_top_of_stack, + root_symbol, + read_symbol, + json_in.size(), + stream); + } else { + fst::sparse_stack_op_to_top_of_stack( + stack_ops.data(), + device_span{stack_op_indices.data(), num_stack_ops}, + JSONToStackOp{}, + d_top_of_stack, + root_symbol, + read_symbol, + json_in.size(), + stream); + } +} + +std::pair, rmm::device_uvector> process_token_stream( + device_span tokens, + device_span token_indices, + rmm::cuda_stream_view stream) +{ + // Instantiate FST for post-processing the token stream to remove all tokens that belong to an + // invalid JSON line + token_filter::UnwrapTokenFromSymbolOp sgid_op{}; + auto filter_fst = + fst::detail::make_fst(fst::detail::make_symbol_group_lut(token_filter::symbol_groups, sgid_op), + 
fst::detail::make_transition_table(token_filter::transition_table), + fst::detail::make_translation_functor(token_filter::TransduceToken{}), + stream); + + auto const mr = rmm::mr::get_current_device_resource(); + rmm::device_scalar d_num_selected_tokens(stream, mr); + rmm::device_uvector filtered_tokens_out{tokens.size(), stream, mr}; + rmm::device_uvector filtered_token_indices_out{tokens.size(), stream, mr}; + + // The FST is run on the reverse token stream, discarding all tokens between ErrorBegin and the + // next LineEnd (LineEnd, inv_token_0, inv_token_1, ..., inv_token_n, ErrorBegin, LineEnd, ...), + // emitting a [StructBegin, StructEnd] pair at the end of such an invalid line. In that example, + // inv_token_i for i in [0, n] together with the ErrorBegin are removed and replaced with + // StructBegin, StructEnd. All LineEnd tokens are removed as well, since they are no longer + // relevant after this stage. + filter_fst.Transduce( + thrust::make_reverse_iterator(thrust::make_zip_iterator(tokens.data(), token_indices.data()) + + tokens.size()), + static_cast(tokens.size()), + thrust::make_reverse_iterator( + thrust::make_zip_iterator(filtered_tokens_out.data(), filtered_token_indices_out.data()) + + tokens.size()), + thrust::make_discard_iterator(), + d_num_selected_tokens.data(), + token_filter::start_state, stream); + + auto const num_total_tokens = d_num_selected_tokens.value(stream); + rmm::device_uvector tokens_out{num_total_tokens, stream, mr}; + rmm::device_uvector token_indices_out{num_total_tokens, stream, mr}; + thrust::copy(rmm::exec_policy(stream), + filtered_tokens_out.end() - num_total_tokens, + filtered_tokens_out.end(), + tokens_out.data()); + thrust::copy(rmm::exec_policy(stream), + filtered_token_indices_out.end() - num_total_tokens, + filtered_token_indices_out.end(), + token_indices_out.data()); + + return std::make_pair(std::move(tokens_out), std::move(token_indices_out)); } std::pair, rmm::device_uvector> get_token_stream( @@ -1100,13
+1409,25 @@ std::pair, rmm::device_uvector> ge auto const new_line_delimited_json = options.is_enabled_lines(); + // (!new_line_delimited_json) => JSON + // (new_line_delimited_json and recover_from_error) => JSON_LINES_RECOVER + // (new_line_delimited_json and !recover_from_error) => JSON_LINES + auto format = new_line_delimited_json + ? (options.recovery_mode() == json_recovery_mode_t::RECOVER_WITH_NULL + ? tokenizer_pda::json_format_cfg_t::JSON_LINES_RECOVER + : tokenizer_pda::json_format_cfg_t::JSON_LINES) + : tokenizer_pda::json_format_cfg_t::JSON; + // Prepare for PDA transducer pass, merging input symbols with stack symbols - rmm::device_uvector pda_sgids = [json_in, stream]() { + auto const recover_from_error = (format == tokenizer_pda::json_format_cfg_t::JSON_LINES_RECOVER); + rmm::device_uvector pda_sgids = [json_in, stream, recover_from_error]() { // Memory holding the top-of-stack stack context for the input rmm::device_uvector stack_op_indices{json_in.size(), stream}; // Identify what is the stack context for each input character (JSON-root, struct, or list) - get_stack_context(json_in, stack_op_indices.data(), stream); + auto const stack_behavior = recover_from_error ? 
stack_behavior_t::ResetOnDelimiter + : stack_behavior_t::PushPopWithoutReset; + get_stack_context(json_in, stack_op_indices.data(), stack_behavior, stream); rmm::device_uvector pda_sgids{json_in.size(), stream}; auto zip_in = thrust::make_zip_iterator(json_in.data(), stack_op_indices.data()); @@ -1118,22 +1439,21 @@ std::pair, rmm::device_uvector> ge return pda_sgids; }(); - // PDA transducer alias - using ToTokenStreamFstT = - cudf::io::fst::detail::Dfa( - tokenizer_pda::pda_state_t::PD_NUM_STATES)>; - // Instantiating PDA transducer - std::vector> pda_sgid_identity{tokenizer_pda::NUM_PDA_SGIDS}; + std::array, tokenizer_pda::NUM_PDA_SGIDS> pda_sgid_identity{}; std::generate(std::begin(pda_sgid_identity), std::end(pda_sgid_identity), [i = char{0}]() mutable { return std::vector{i++}; }); - ToTokenStreamFstT json_to_tokens_fst{pda_sgid_identity, - tokenizer_pda::get_transition_table(new_line_delimited_json), - tokenizer_pda::get_translation_table(), - stream}; + + constexpr auto max_translation_table_size = + tokenizer_pda::NUM_PDA_SGIDS * + static_cast(tokenizer_pda::pda_state_t::PD_NUM_STATES); + auto json_to_tokens_fst = fst::detail::make_fst( + fst::detail::make_symbol_group_lut(pda_sgid_identity), + fst::detail::make_transition_table(tokenizer_pda::get_transition_table(format)), + fst::detail::make_translation_table( + tokenizer_pda::get_translation_table(recover_from_error)), + stream); // Perform a PDA-transducer pass // Compute the maximum amount of tokens that can possibly be emitted for a given input size @@ -1145,21 +1465,34 @@ std::pair, rmm::device_uvector> ge auto const max_token_out_count = cudf::util::div_rounding_up_safe(json_in.size(), min_chars_per_struct) * max_tokens_per_struct; rmm::device_scalar num_written_tokens{stream}; - rmm::device_uvector tokens{max_token_out_count, stream, mr}; - rmm::device_uvector tokens_indices{max_token_out_count, stream, mr}; + // In case we're recovering on invalid JSON lines, post-processing the token stream 
requires to + // see a JSON-line delimiter as the very first item + SymbolOffsetT const delimiter_offset = + (format == tokenizer_pda::json_format_cfg_t::JSON_LINES_RECOVER ? 1 : 0); + rmm::device_uvector tokens{max_token_out_count + delimiter_offset, stream, mr}; + rmm::device_uvector tokens_indices{ + max_token_out_count + delimiter_offset, stream, mr}; json_to_tokens_fst.Transduce(pda_sgids.begin(), static_cast(json_in.size()), - tokens.data(), - tokens_indices.data(), + tokens.data() + delimiter_offset, + tokens_indices.data() + delimiter_offset, num_written_tokens.data(), tokenizer_pda::start_state, stream); - auto const num_total_tokens = num_written_tokens.value(stream); + auto const num_total_tokens = num_written_tokens.value(stream) + delimiter_offset; tokens.resize(num_total_tokens, stream); tokens_indices.resize(num_total_tokens, stream); + if (delimiter_offset == 1) { + tokens.set_element(0, token_t::LineEnd, stream); + auto [filtered_tokens, filtered_tokens_indices] = + process_token_stream(tokens, tokens_indices, stream); + tokens = std::move(filtered_tokens); + tokens_indices = std::move(filtered_tokens_indices); + } + CUDF_EXPECTS(num_total_tokens <= max_token_out_count, "Generated token count exceeds the expected token count"); @@ -1281,6 +1614,7 @@ void make_json_column(json_column& root_column, case token_t::ValueBegin: return "ValueBegin"; case token_t::ValueEnd: return "ValueEnd"; case token_t::ErrorBegin: return "ErrorBegin"; + case token_t::LineEnd: return "LineEnd"; default: return "Unknown"; } }; diff --git a/cpp/tests/io/fst/fst_test.cu b/cpp/tests/io/fst/fst_test.cu index 1970b29fee9..fd69251e4f5 100644 --- a/cpp/tests/io/fst/fst_test.cu +++ b/cpp/tests/io/fst/fst_test.cu @@ -129,9 +129,6 @@ TEST_F(FstTest, GroundTruth) // Type sufficiently large to index symbols within the input and output (may be unsigned) using SymbolOffsetT = uint32_t; - // Helper class to set up transition table, symbol group lookup table, and translation table - 
using DfaFstT = cudf::io::fst::detail::Dfa; - // Prepare cuda stream for data transfers & kernels rmm::cuda_stream stream{}; rmm::cuda_stream_view stream_view(stream); @@ -167,7 +164,11 @@ TEST_F(FstTest, GroundTruth) cudf::detail::hostdevice_vector out_indexes_gpu(input.size(), stream_view); // Run algorithm - DfaFstT parser{pda_sgs, pda_state_tt, pda_out_tt, stream.value()}; + auto parser = cudf::io::fst::detail::make_fst( + cudf::io::fst::detail::make_symbol_group_lut(pda_sgs), + cudf::io::fst::detail::make_transition_table(pda_state_tt), + cudf::io::fst::detail::make_translation_table(pda_out_tt), + stream); // Allocate device-side temporary storage & run algorithm parser.Transduce(d_input.data(), diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index d0c16078329..e4d52a2953e 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -1769,4 +1769,50 @@ TEST_F(JsonReaderTest, TrailingCommas) } } +TEST_F(JsonReaderTest, JSONLinesRecovering) +{ + std::string data = + // 0 -> a: -2 (valid) + R"({"a":-2})" + "\n" + // 1 -> (invalid) + R"({"a":])" + "\n" + // 2 -> (invalid) + R"({"b":{"a":[321})" + "\n" + // 3 -> c: 1.2 (valid) + R"({"c":1.2})" + "\n" + "\n" + // 4 -> a: 123 (valid) + R"({"a":123})"; + + auto filepath = temp_env->get_temp_dir() + "RecoveringLines.json"; + { + std::ofstream outfile(filepath, std::ofstream::out); + outfile << data; + } + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{filepath}) + .lines(true) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); + + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + + EXPECT_EQ(result.tbl->num_columns(), 2); + EXPECT_EQ(result.tbl->num_rows(), 5); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::INT64); + EXPECT_EQ(result.tbl->get_column(1).type().id(), cudf::type_id::FLOAT64); + + std::vector a_validity{true, false, false, false, true}; + std::vector
c_validity{false, false, false, true, false}; + + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(0), + int64_wrapper{{-2, 0, 0, 0, 123}, a_validity.cbegin()}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(result.tbl->get_column(1), + float64_wrapper{{0.0, 0.0, 0.0, 1.2, 0.0}, c_validity.cbegin()}); +} + CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json_tree.cpp b/cpp/tests/io/json_tree.cpp index a81348872cf..ad6678dbe5b 100644 --- a/cpp/tests/io/json_tree.cpp +++ b/cpp/tests/io/json_tree.cpp @@ -258,6 +258,7 @@ tree_meta_t2 get_tree_representation_cpu( case cuio_json::token_t::ValueEnd: return "VE"; case cuio_json::token_t::StructMemberBegin: return " <"; case cuio_json::token_t::StructMemberEnd: return " >"; + case cuio_json::token_t::LineEnd: return ";"; default: return "."; } }; diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index d82abdd1287..00d657108b8 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -32,6 +32,11 @@ #include #include +#include + +#include +#include + #include namespace cuio_json = cudf::io::json; @@ -163,7 +168,8 @@ TEST_F(JsonTest, StackContext) cudf::detail::hostdevice_vector stack_context(input.size(), stream); // Run algorithm - cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stream); + constexpr auto stack_behavior = cuio_json::stack_behavior_t::PushPopWithoutReset; + cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stack_behavior, stream); // Copy back the results stack_context.device_to_host_async(stream); @@ -211,7 +217,8 @@ TEST_F(JsonTest, StackContextUtf8) cudf::detail::hostdevice_vector stack_context(input.size(), stream); // Run algorithm - cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stream); + constexpr auto stack_behavior = cuio_json::stack_behavior_t::PushPopWithoutReset; + cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stack_behavior, stream); 
// Copy back the results stack_context.device_to_host_async(stream); @@ -229,6 +236,55 @@ TEST_F(JsonTest, StackContextUtf8) CUDF_TEST_EXPECT_VECTOR_EQUAL(golden_stack_context, stack_context, stack_context.size()); } +TEST_F(JsonTest, StackContextRecovering) +{ + // Type used to represent the atomic symbol type used within the finite-state machine + using SymbolT = char; + using StackSymbolT = char; + + // Prepare cuda stream for data transfers & kernels + auto const stream = cudf::get_default_stream(); + + // JSON lines input that recovers on invalid lines + std::string const input = R"({"a":-2}, + {"a": + {"a":{"a":[321 + {"a":[1]} + + {"b":123} + )"; + + // Expected stack context (including stack context of the newline characters) + std::string const golden_stack_context = + "_{{{{{{{__" + "___{{{{{" + "___{{{{{{{{{{[[[[" + "___{{{{{[[{_" + "_" + "___{{{{{{{{_" + "__"; + + // Prepare input & output buffers + cudf::string_scalar const d_scalar(input, true, stream); + auto const d_input = + cudf::device_span{d_scalar.data(), static_cast(d_scalar.size())}; + cudf::detail::hostdevice_vector stack_context(input.size(), stream); + + // Run algorithm + constexpr auto stack_behavior = cuio_json::stack_behavior_t::ResetOnDelimiter; + cuio_json::detail::get_stack_context(d_input, stack_context.device_ptr(), stack_behavior, stream); + + // Copy back the results + stack_context.device_to_host_async(stream); + + // Make sure we copied back the stack context + stream.synchronize(); + + // Verify results + ASSERT_EQ(golden_stack_context.size(), stack_context.size()); + CUDF_TEST_EXPECT_VECTOR_EQUAL(golden_stack_context, stack_context, stack_context.size()); +} + TEST_F(JsonTest, TokenStream) { using cuio_json::PdaTokenT; @@ -264,10 +320,8 @@ TEST_F(JsonTest, TokenStream) auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( d_input, default_options, stream, rmm::mr::get_current_device_resource()); // Copy back the number of tokens that were written - 
thrust::host_vector const tokens_gpu = - cudf::detail::make_host_vector_async(d_tokens_gpu, stream); - thrust::host_vector const token_indices_gpu = - cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); + auto const tokens_gpu = cudf::detail::make_std_vector_async(d_tokens_gpu, stream); + auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream); // Golden token stream sample using token_t = cuio_json::token_t; @@ -400,10 +454,8 @@ TEST_F(JsonTest, TokenStream2) auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( d_input, default_options, stream, rmm::mr::get_current_device_resource()); // Copy back the number of tokens that were written - thrust::host_vector const tokens_gpu = - cudf::detail::make_host_vector_async(d_tokens_gpu, stream); - thrust::host_vector const token_indices_gpu = - cudf::detail::make_host_vector_async(d_token_indices_gpu, stream); + auto const tokens_gpu = cudf::detail::make_std_vector_async(d_tokens_gpu, stream); + auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream); // Golden token stream sample using token_t = cuio_json::token_t; @@ -487,6 +539,228 @@ TEST_P(JsonParserTest, ExtractColumn) CUDF_TEST_EXPECT_COLUMNS_EQUAL(expected_col2, parsed_col2); } +TEST_F(JsonTest, RecoveringTokenStream) +{ + // Test input. 
Inline comments used to indicate character indexes + // 012345678 <= line 0 + std::string const input = R"({"a":-2},)" + // 9 + "\n" + // 01234 <= line 1 + R"({"a":)" + // 5 + "\n" + // 67890123456789 <= line 2 + R"({"a":{"a":[321)" + // 0 + "\n" + // 123456789 <= line 3 + R"({"a":[1]})" + // 0 + "\n" + // 1 <= line 4 + "\n" + // 23456789 <= line 5 + R"({"b":123})"; + + // Golden token stream sample + using token_t = cuio_json::token_t; + std::vector> const golden_token_stream = { + // Line 0 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 1 (valid) + {10, token_t::StructBegin}, + {11, token_t::StructMemberBegin}, + {11, token_t::FieldNameBegin}, + {13, token_t::FieldNameEnd}, + // Line 2 (valid) + {16, token_t::StructBegin}, + {17, token_t::StructMemberBegin}, + {17, token_t::FieldNameBegin}, + {19, token_t::FieldNameEnd}, + {21, token_t::StructBegin}, + {22, token_t::StructMemberBegin}, + {22, token_t::FieldNameBegin}, + {24, token_t::FieldNameEnd}, + {26, token_t::ListBegin}, + {27, token_t::ValueBegin}, + {30, token_t::ValueEnd}, + // Line 3 (valid) + {31, token_t::StructBegin}, + {32, token_t::StructMemberBegin}, + {32, token_t::FieldNameBegin}, + {34, token_t::FieldNameEnd}, + {36, token_t::ListBegin}, + {37, token_t::ValueBegin}, + {38, token_t::ValueEnd}, + {38, token_t::ListEnd}, + {39, token_t::StructMemberEnd}, + {39, token_t::StructEnd}, + // Line 4 (empty) + // Line 5 (valid) + {42, token_t::StructBegin}, + {43, token_t::StructMemberBegin}, + {43, token_t::FieldNameBegin}, + {45, token_t::FieldNameEnd}, + {47, token_t::ValueBegin}, + {50, token_t::ValueEnd}, + {50, token_t::StructMemberEnd}, + {50, token_t::StructEnd}}; + + auto const stream = cudf::get_default_stream(); + + // Default parsing options + cudf::io::json_reader_options default_options{}; + default_options.set_recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL); + default_options.enable_lines(true); + + // Prepare input & output buffers + 
cudf::string_scalar const d_scalar(input, true, stream); + auto const d_input = cudf::device_span{ + d_scalar.data(), static_cast(d_scalar.size())}; + + // Parse the JSON and get the token stream + auto [d_tokens_gpu, d_token_indices_gpu] = cuio_json::detail::get_token_stream( + d_input, default_options, stream, rmm::mr::get_current_device_resource()); + // Copy back the number of tokens that were written + auto const tokens_gpu = cudf::detail::make_std_vector_async(d_tokens_gpu, stream); + auto const token_indices_gpu = cudf::detail::make_std_vector_async(d_token_indices_gpu, stream); + + // Verify the number of tokens matches + ASSERT_EQ(golden_token_stream.size(), tokens_gpu.size()); + ASSERT_EQ(golden_token_stream.size(), token_indices_gpu.size()); + + for (std::size_t i = 0; i < tokens_gpu.size(); i++) { + // Ensure the index the tokens are pointing to do match + EXPECT_EQ(golden_token_stream[i].first, token_indices_gpu[i]) << "Mismatch at #" << i; + // Ensure the token category is correct + EXPECT_EQ(golden_token_stream[i].second, tokens_gpu[i]) << "Mismatch at #" << i; + } +} + +TEST_F(JsonTest, PostProcessTokenStream) +{ + // Golden token stream sample + using token_t = cuio_json::token_t; + using token_index_t = cuio_json::SymbolOffsetT; + using tuple_t = thrust::tuple; + + std::vector const input = {// Line 0 (invalid) + {0, token_t::LineEnd}, + {0, token_t::StructBegin}, + {1, token_t::StructMemberBegin}, + {1, token_t::FieldNameBegin}, + {3, token_t::FieldNameEnd}, + {5, token_t::ValueBegin}, + {7, token_t::ValueEnd}, + {7, token_t::StructMemberEnd}, + {7, token_t::StructEnd}, + {8, token_t::ErrorBegin}, + {9, token_t::LineEnd}, + // Line 1 + {10, token_t::StructBegin}, + {11, token_t::StructMemberBegin}, + {11, token_t::FieldNameBegin}, + {13, token_t::FieldNameEnd}, + {15, token_t::LineEnd}, + // Line 2 (invalid) + {16, token_t::StructBegin}, + {17, token_t::StructMemberBegin}, + {17, token_t::FieldNameBegin}, + {19, token_t::FieldNameEnd}, + {21, 
token_t::StructBegin}, + {22, token_t::StructMemberBegin}, + {22, token_t::FieldNameBegin}, + {24, token_t::FieldNameEnd}, + {26, token_t::ListBegin}, + {27, token_t::ValueBegin}, + {29, token_t::ErrorBegin}, + {30, token_t::LineEnd}, + // Line 3 (invalid) + {31, token_t::StructBegin}, + {32, token_t::StructMemberBegin}, + {32, token_t::FieldNameBegin}, + {34, token_t::FieldNameEnd}, + {36, token_t::ListBegin}, + {37, token_t::ValueBegin}, + {38, token_t::ValueEnd}, + {38, token_t::ListEnd}, + {39, token_t::StructMemberEnd}, + {39, token_t::StructEnd}, + {40, token_t::ErrorBegin}, + {40, token_t::LineEnd}, + // Line 4 + {41, token_t::LineEnd}, + // Line 5 + {42, token_t::StructBegin}, + {43, token_t::StructMemberBegin}, + {43, token_t::FieldNameBegin}, + {45, token_t::FieldNameEnd}, + {47, token_t::ValueBegin}, + {50, token_t::ValueEnd}, + {50, token_t::StructMemberEnd}, + {50, token_t::StructEnd}}; + + std::vector const expected_output = {// Line 0 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 1 + {10, token_t::StructBegin}, + {11, token_t::StructMemberBegin}, + {11, token_t::FieldNameBegin}, + {13, token_t::FieldNameEnd}, + // Line 2 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 3 (invalid) + {0, token_t::StructBegin}, + {0, token_t::StructEnd}, + // Line 4 (empty) + // Line 5 + {42, token_t::StructBegin}, + {43, token_t::StructMemberBegin}, + {43, token_t::FieldNameBegin}, + {45, token_t::FieldNameEnd}, + {47, token_t::ValueBegin}, + {50, token_t::ValueEnd}, + {50, token_t::StructMemberEnd}, + {50, token_t::StructEnd}}; + + // Decompose tuples + auto const stream = cudf::get_default_stream(); + std::vector offsets(input.size()); + std::vector tokens(input.size()); + auto token_tuples = thrust::make_zip_iterator(offsets.begin(), tokens.begin()); + thrust::copy(input.cbegin(), input.cend(), token_tuples); + + // Initialize device-side test data + auto const d_offsets = 
cudf::detail::make_device_uvector_async( + cudf::host_span{offsets.data(), offsets.size()}, + stream, + rmm::mr::get_current_device_resource()); + auto const d_tokens = + cudf::detail::make_device_uvector_async(tokens, stream, rmm::mr::get_current_device_resource()); + + // Run system-under-test + auto [d_filtered_tokens, d_filtered_indices] = + cuio_json::detail::process_token_stream(d_tokens, d_offsets, stream); + + auto const filtered_tokens = cudf::detail::make_std_vector_async(d_filtered_tokens, stream); + auto const filtered_indices = cudf::detail::make_std_vector_async(d_filtered_indices, stream); + + // Verify the number of tokens matches + ASSERT_EQ(filtered_tokens.size(), expected_output.size()); + ASSERT_EQ(filtered_indices.size(), expected_output.size()); + + for (std::size_t i = 0; i < filtered_tokens.size(); i++) { + // Ensure the index the tokens are pointing to do match + EXPECT_EQ(thrust::get<0>(expected_output[i]), filtered_indices[i]) << "Mismatch at #" << i; + // Ensure the token category is correct + EXPECT_EQ(thrust::get<1>(expected_output[i]), filtered_tokens[i]) << "Mismatch at #" << i; + } +} + TEST_P(JsonParserTest, UTF_JSON) { // Prepare cuda stream for data transfers & kernels