From 9d8e43ef6ad75f6babc08fea88642ea006822e04 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Thu, 23 May 2024 11:41:49 -0400 Subject: [PATCH] Remove legacy JSON reader and concurrent_unordered_map.cuh. (#15813) This completes the final two steps and closes https://github.com/rapidsai/cudf/issues/15537. Also addresses one step of https://github.com/rapidsai/cudf/issues/12261. Authors: - Bradley Dice (https://github.com/bdice) Approvers: - Kyle Edwards (https://github.com/KyleFromNVIDIA) - David Wendt (https://github.com/davidwendt) - Shruti Shivakumar (https://github.com/shrshi) - Matthew Roeschke (https://github.com/mroeschke) URL: https://github.com/rapidsai/cudf/pull/15813 --- cpp/CMakeLists.txt | 2 - cpp/include/cudf/io/json.hpp | 32 - cpp/src/groupby/hash/groupby.cu | 1 - cpp/src/hash/concurrent_unordered_map.cuh | 557 --------------- cpp/src/hash/managed.cuh | 41 -- cpp/src/io/json/legacy/json_gpu.cu | 615 ---------------- cpp/src/io/json/legacy/json_gpu.hpp | 99 --- cpp/src/io/json/legacy/read_json.hpp | 38 - cpp/src/io/json/legacy/reader_impl.cu | 667 ------------------ cpp/src/io/json/read_json.cu | 9 - cpp/tests/CMakeLists.txt | 4 - cpp/tests/hash_map/map_test.cu | 217 ------ cpp/tests/io/json_test.cpp | 49 +- cpp/tests/io/nested_json_test.cpp | 2 +- python/cudf/cudf/_lib/json.pyx | 2 - .../cudf/_lib/pylibcudf/libcudf/io/json.pxd | 3 - python/cudf/cudf/io/json.py | 1 - 17 files changed, 8 insertions(+), 2331 deletions(-) delete mode 100644 cpp/src/hash/concurrent_unordered_map.cuh delete mode 100644 cpp/src/hash/managed.cuh delete mode 100644 cpp/src/io/json/legacy/json_gpu.cu delete mode 100644 cpp/src/io/json/legacy/json_gpu.hpp delete mode 100644 cpp/src/io/json/legacy/read_json.hpp delete mode 100644 cpp/src/io/json/legacy/reader_impl.cu delete mode 100644 cpp/tests/hash_map/map_test.cu diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 7390c465ccb..228d21ddccb 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -390,8 +390,6 @@ add_library( src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu src/io/json/read_json.cu - src/io/json/legacy/json_gpu.cu - src/io/json/legacy/reader_impl.cu src/io/json/parser_features.cpp src/io/json/write_json.cu src/io/orc/aggregate_orc_metadata.cpp diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index aa4bee4fb5e..65ba8f25577 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -270,15 +270,6 @@ class json_reader_options { */ bool is_enabled_dayfirst() const { return _dayfirst; } - /** - * @brief Whether the legacy reader should be used. - * - * @deprecated Since 24.06 - * - * @returns true if the legacy reader will be used, false otherwise - */ - [[deprecated]] bool is_enabled_legacy() const { return _legacy; } - /** * @brief Whether the reader should keep quotes of string values. * @@ -406,15 +397,6 @@ class json_reader_options { */ void enable_dayfirst(bool val) { _dayfirst = val; } - /** - * @brief Set whether to use the legacy reader. - * - * @deprecated Since 24.06 - * - * @param val Boolean value to enable/disable the legacy reader - */ - [[deprecated]] void enable_legacy(bool val) { _legacy = val; } - /** * @brief Set whether the reader should keep quotes of string values. * @@ -605,20 +587,6 @@ class json_reader_options_builder { return *this; } - /** - * @brief Set whether to use the legacy reader. - * - * @deprecated Since 24.06 - * - * @param val Boolean value to enable/disable legacy parsing - * @return this for chaining - */ - [[deprecated]] json_reader_options_builder& legacy(bool val) - { - options._legacy = val; - return *this; - } - /** * @brief Set whether the reader should keep quotes of string values. * diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu index 4f75ab19c66..0ec293ae3f0 100644 --- a/cpp/src/groupby/hash/groupby.cu +++ b/cpp/src/groupby/hash/groupby.cu @@ -16,7 +16,6 @@ #include "groupby/common/utils.hpp" #include "groupby/hash/groupby_kernels.cuh" -#include "hash/concurrent_unordered_map.cuh" #include #include diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh deleted file mode 100644 index a010a462de3..00000000000 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ /dev/null @@ -1,557 +0,0 @@ -/* - * Copyright (c) 2017-2024, NVIDIA CORPORATION. All rights reserved. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "hash/managed.cuh" - -#include -#include -#include -#include -#include - -#include -#include - -#include -#include - -#include -#include -#include -#include - -namespace { -template -struct packed { - using type = void; -}; -template <> -struct packed { - using type = uint64_t; -}; -template <> -struct packed { - using type = uint32_t; -}; -template -using packed_t = typename packed::type; - -/** - * @brief Indicates if a pair type can be packed. - * - * When the size of the key,value pair being inserted into the hash table is - * equal in size to a type where atomicCAS is natively supported, it is more - * efficient to "pack" the pair and insert it with a single atomicCAS. - * - * Only integral key and value types may be packed because we use - * bitwise equality comparison, which may not be valid for non-integral - * types. - * - * Also, the `pair_type` must not contain any padding bits otherwise - * accessing the packed value would be undefined. - * - * @tparam pair_type The pair type that will be packed - * @return true If the pair type can be packed - * @return false If the pair type cannot be packed - */ -template -constexpr bool is_packable() -{ - return std::is_integral_v and std::is_integral_v and - not std::is_void_v> and - std::has_unique_object_representations_v; -} - -/** - * @brief Allows viewing a pair in a packed representation - * - * Used as an optimization for inserting when a pair can be inserted with a - * single atomicCAS - */ -template -union pair_packer; - -template -union pair_packer()>> { - using packed_type = packed_t; - packed_type packed; - pair_type pair; - - __device__ pair_packer(pair_type _pair) : pair{_pair} {} - - __device__ pair_packer(packed_type _packed) : packed{_packed} {} -}; -} // namespace - -/** - * Supports concurrent insert, but not concurrent insert and find. - * - * @note The user is responsible for the following stream semantics: - * - Either the same stream should be used to create the map as is used by the kernels that access - * it, or - * - the stream used to create the map should be synchronized before it is accessed from a different - * stream or from host code. - * - * TODO: - * - add constructor that takes pointer to hash_table to avoid allocations - */ -template , - typename Equality = equal_to, - typename Allocator = rmm::mr::polymorphic_allocator>> -class concurrent_unordered_map { - public: - using size_type = size_t; - using hasher = Hasher; - using key_equal = Equality; - using allocator_type = Allocator; - using key_type = Key; - using mapped_type = Element; - using value_type = thrust::pair; - using iterator = cycle_iterator_adapter; - using const_iterator = cycle_iterator_adapter const; - - public: - /** - * @brief Factory to construct a new concurrent unordered map. - * - * Returns a `std::unique_ptr` to a new concurrent unordered map object. The - * map is non-owning and trivially copyable and should be passed by value into - * kernels. The `unique_ptr` contains a custom deleter that will free the - * map's contents. - * - * @note The implementation of this unordered_map uses sentinel values to - * indicate an entry in the hash table that is empty, i.e., if a hash bucket - * is empty, the pair residing there will be equal to (unused_key, - * unused_element). As a result, attempting to insert a key equal to - *`unused_key` results in undefined behavior. - * - * @note All allocations, kernels and copies in the constructor take place - * on stream but the constructor does not synchronize the stream. It is the user's - * responsibility to synchronize or use the same stream to access the map. - * - * @param capacity The maximum number of pairs the map may hold - * @param stream CUDA stream used for device memory operations and kernel launches. - * @param unused_element The sentinel value to use for an empty value - * @param unused_key The sentinel value to use for an empty key - * @param hash_function The hash function to use for hashing keys - * @param equal The equality comparison function for comparing if two keys are - * equal - * @param allocator The allocator to use for allocation the hash table's - * storage - */ - static auto create(size_type capacity, - rmm::cuda_stream_view stream, - mapped_type const unused_element = std::numeric_limits::max(), - key_type const unused_key = std::numeric_limits::max(), - Hasher const& hash_function = hasher(), - Equality const& equal = key_equal(), - allocator_type const& allocator = allocator_type()) - { - CUDF_FUNC_RANGE(); - using Self = concurrent_unordered_map; - - // Note: need `(*p).destroy` instead of `p->destroy` here - // due to compiler bug: https://github.com/rapidsai/cudf/pull/5692 - auto deleter = [stream](Self* p) { (*p).destroy(stream); }; - - return std::unique_ptr>{ - new Self(capacity, unused_element, unused_key, hash_function, equal, allocator, stream), - deleter}; - } - - /** - * @brief Returns an iterator to the first element in the map - * - * @note `__device__` code that calls this function should either run in the - * same stream as `create()`, or the accessing stream either be running on the - * same stream as create(), or the accessing stream should be appropriately - * synchronized with the creating stream. - * - * @returns iterator to the first element in the map. - */ - __device__ iterator begin() - { - return iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, m_hashtbl_values); - } - - /** - * @brief Returns a constant iterator to the first element in the map - * - * @note `__device__` code that calls this function should either run in the - * same stream as `create()`, or the accessing stream either be running on the - * same stream as create(), or the accessing stream should be appropriately - * synchronized with the creating stream. - * - * @returns constant iterator to the first element in the map. - */ - __device__ const_iterator begin() const - { - return const_iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, m_hashtbl_values); - } - - /** - * @brief Returns an iterator to the one past the last element in the map - * - * @note `__device__` code that calls this function should either run in the - * same stream as `create()`, or the accessing stream either be running on the - * same stream as create(), or the accessing stream should be appropriately - * synchronized with the creating stream. - * - * @returns iterator to the one past the last element in the map. - */ - __device__ iterator end() - { - return iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, m_hashtbl_values + m_capacity); - } - - /** - * @brief Returns a constant iterator to the one past the last element in the map - * - * @note When called in a device code, user should make sure that it should - * either be running on the same stream as create(), or the accessing stream - * should be appropriately synchronized with the creating stream. - * - * @returns constant iterator to the one past the last element in the map. - */ - __device__ const_iterator end() const - { - return const_iterator( - m_hashtbl_values, m_hashtbl_values + m_capacity, m_hashtbl_values + m_capacity); - } - __host__ __device__ value_type* data() const { return m_hashtbl_values; } - - __host__ __device__ key_type get_unused_key() const { return m_unused_key; } - - __host__ __device__ mapped_type get_unused_element() const { return m_unused_element; } - - [[nodiscard]] __host__ __device__ size_type capacity() const { return m_capacity; } - - private: - /** - * @brief Enumeration of the possible results of attempting to insert into - *a hash bucket - */ - enum class insert_result { - CONTINUE, ///< Insert did not succeed, continue trying to insert - ///< (collision) - SUCCESS, ///< New pair inserted successfully - DUPLICATE ///< Insert did not succeed, key is already present - }; - - /** - * @brief Specialization for value types that can be packed. - * - * When the size of the key,value pair being inserted is equal in size to - *a type where atomicCAS is natively supported, this optimization path - *will insert the pair in a single atomicCAS operation. - */ - template - __device__ std::enable_if_t(), insert_result> attempt_insert( - value_type* const __restrict__ insert_location, value_type const& insert_pair) - { - pair_packer expected{thrust::make_pair(m_unused_key, m_unused_element)}; - pair_packer desired{insert_pair}; - - using packed_type = typename pair_packer::packed_type; - - auto* insert_ptr = reinterpret_cast(insert_location); - cuda::atomic_ref ref{*insert_ptr}; - auto const success = - ref.compare_exchange_strong(expected.packed, desired.packed, cuda::std::memory_order_relaxed); - - if (success) { - return insert_result::SUCCESS; - } else if (m_equal(expected.pair.first, insert_pair.first)) { - return insert_result::DUPLICATE; - } - return insert_result::CONTINUE; - } - - /** - * @brief Attempts to insert a key,value pair at the specified hash bucket. - * - * @param[in] insert_location Pointer to hash bucket to attempt insert - * @param[in] insert_pair The pair to insert - * @return Enum indicating result of insert attempt. - */ - template - __device__ std::enable_if_t(), insert_result> attempt_insert( - value_type* const __restrict__ insert_location, value_type const& insert_pair) - { - auto expected = m_unused_key; - cuda::atomic_ref ref{insert_location->first}; - auto const key_success = - ref.compare_exchange_strong(expected, insert_pair.first, cuda::std::memory_order_relaxed); - - // Hash bucket empty - if (key_success) { - insert_location->second = insert_pair.second; - return insert_result::SUCCESS; - } - // Key already exists - else if (m_equal(expected, insert_pair.first)) { - return insert_result::DUPLICATE; - } - - return insert_result::CONTINUE; - } - - public: - /** - * @brief Attempts to insert a key, value pair into the map. - * - * Returns an iterator, boolean pair. - * - * If the new key already present in the map, the iterator points to - * the location of the existing key and the boolean is `false` indicating - * that the insert did not succeed. - * - * If the new key was not present, the iterator points to the location - * where the insert occurred and the boolean is `true` indicating that the - *insert succeeded. - * - * @param insert_pair The key and value pair to insert - * @return Iterator, Boolean pair. Iterator is to the location of the - *newly inserted pair, or the existing pair that prevented the insert. - *Boolean indicates insert success. - */ - __device__ thrust::pair insert(value_type const& insert_pair) - { - size_type const key_hash{m_hf(insert_pair.first)}; - size_type index{key_hash % m_capacity}; - - insert_result status{insert_result::CONTINUE}; - - value_type* current_bucket{nullptr}; - - while (status == insert_result::CONTINUE) { - current_bucket = &m_hashtbl_values[index]; - status = attempt_insert(current_bucket, insert_pair); - index = (index + 1) % m_capacity; - } - - bool const insert_success = status == insert_result::SUCCESS; - - return thrust::make_pair( - iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, current_bucket), insert_success); - } - - /** - * @brief Searches the map for the specified key. - * - * @note `find` is not threadsafe with `insert`. I.e., it is not safe to - *do concurrent `insert` and `find` operations. - * - * @param k The key to search for - * @return An iterator to the key if it exists, else map.end() - */ - __device__ const_iterator find(key_type const& k) const - { - size_type const key_hash = m_hf(k); - size_type index = key_hash % m_capacity; - - value_type* current_bucket = &m_hashtbl_values[index]; - - while (true) { - key_type const existing_key = current_bucket->first; - - if (m_unused_key == existing_key) { return this->end(); } - - if (m_equal(k, existing_key)) { - return const_iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, current_bucket); - } - - index = (index + 1) % m_capacity; - current_bucket = &m_hashtbl_values[index]; - } - } - - /** - * @brief Searches the map for the specified key. - * - * This version of the find function specifies a hashing function and an - * equality comparison. This allows the caller to use different functions - * for insert and find (for example, when you want to insert keys from - * one table and use find to match keys from a different table with the - * keys from the first table). - * - * @note `find` is not threadsafe with `insert`. I.e., it is not safe to - * do concurrent `insert` and `find` operations. - * - * @tparam find_hasher Type of hashing function - * @tparam find_key_equal Type of equality comparison - * - * @param k The key to search for - * @param f_hash The hashing function to use to hash this key - * @param f_equal The equality function to use to compare this key with the - * contents of the hash table - * @return An iterator to the key if it exists, else map.end() - */ - template - __device__ const_iterator find(key_type const& k, - find_hasher f_hash, - find_key_equal f_equal) const - { - size_type const key_hash = f_hash(k); - size_type index = key_hash % m_capacity; - - value_type* current_bucket = &m_hashtbl_values[index]; - - while (true) { - key_type const existing_key = current_bucket->first; - - if (m_unused_key == existing_key) { return this->end(); } - - if (f_equal(k, existing_key)) { - return const_iterator(m_hashtbl_values, m_hashtbl_values + m_capacity, current_bucket); - } - - index = (index + 1) % m_capacity; - current_bucket = &m_hashtbl_values[index]; - } - } - - void assign_async(concurrent_unordered_map const& other, rmm::cuda_stream_view stream) - { - if (other.m_capacity <= m_capacity) { - m_capacity = other.m_capacity; - } else { - m_allocator.deallocate(m_hashtbl_values, m_capacity, stream); - m_capacity = other.m_capacity; - m_capacity = other.m_capacity; - - m_hashtbl_values = m_allocator.allocate(m_capacity, stream); - } - CUDF_CUDA_TRY(cudaMemcpyAsync(m_hashtbl_values, - other.m_hashtbl_values, - m_capacity * sizeof(value_type), - cudaMemcpyDefault, - stream.value())); - } - - void clear_async(rmm::cuda_stream_view stream) - { - constexpr int block_size = 128; - init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>( - m_hashtbl_values, m_capacity, m_unused_key, m_unused_element); - } - - void print() - { - for (size_type i = 0; i < m_capacity; ++i) { - std::cout << i << ": " << m_hashtbl_values[i].first << "," << m_hashtbl_values[i].second - << std::endl; - } - } - - void prefetch(int const dev_id, rmm::cuda_stream_view stream) - { - cudaPointerAttributes hashtbl_values_ptr_attributes; - cudaError_t status = cudaPointerGetAttributes(&hashtbl_values_ptr_attributes, m_hashtbl_values); - - if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) { - CUDF_CUDA_TRY(cudaMemPrefetchAsync( - m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream.value())); - } - CUDF_CUDA_TRY(cudaMemPrefetchAsync(this, sizeof(*this), dev_id, stream.value())); - } - - /** - * @brief Frees the contents of the map and destroys the map object. - * - * This function is invoked as the deleter of the `std::unique_ptr` returned - * from the `create()` factory function. - * - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - void destroy(rmm::cuda_stream_view stream) - { - m_allocator.deallocate(m_hashtbl_values, m_capacity, stream); - delete this; - } - - concurrent_unordered_map() = delete; - concurrent_unordered_map(concurrent_unordered_map const&) = default; - concurrent_unordered_map(concurrent_unordered_map&&) = default; - concurrent_unordered_map& operator=(concurrent_unordered_map const&) = default; - concurrent_unordered_map& operator=(concurrent_unordered_map&&) = default; - ~concurrent_unordered_map() = default; - - private: - hasher m_hf; - key_equal m_equal; - mapped_type m_unused_element; - key_type m_unused_key; - allocator_type m_allocator; - size_type m_capacity; - value_type* m_hashtbl_values; - - /** - * @brief Private constructor used by `create` factory function. - * - * @param capacity The desired m_capacity of the hash table - * @param unused_element The sentinel value to use for an empty value - * @param unused_key The sentinel value to use for an empty key - * @param hash_function The hash function to use for hashing keys - * @param equal The equality comparison function for comparing if two keys - *are equal - * @param allocator The allocator to use for allocation the hash table's - * storage - * @param stream CUDA stream used for device memory operations and kernel launches. - */ - concurrent_unordered_map(size_type capacity, - mapped_type const unused_element, - key_type const unused_key, - Hasher const& hash_function, - Equality const& equal, - allocator_type const& allocator, - rmm::cuda_stream_view stream) - : m_hf(hash_function), - m_equal(equal), - m_allocator(allocator), - m_capacity(capacity), - m_unused_element(unused_element), - m_unused_key(unused_key) - { - m_hashtbl_values = m_allocator.allocate(m_capacity, stream); - constexpr int block_size = 128; - { - cudaPointerAttributes hashtbl_values_ptr_attributes; - cudaError_t status = - cudaPointerGetAttributes(&hashtbl_values_ptr_attributes, m_hashtbl_values); - - if (cudaSuccess == status && isPtrManaged(hashtbl_values_ptr_attributes)) { - int dev_id = 0; - CUDF_CUDA_TRY(cudaGetDevice(&dev_id)); - CUDF_CUDA_TRY(cudaMemPrefetchAsync( - m_hashtbl_values, m_capacity * sizeof(value_type), dev_id, stream.value())); - } - } - - if (m_capacity > 0) { - init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>( - m_hashtbl_values, m_capacity, m_unused_key, m_unused_element); - } - - CUDF_CHECK_CUDA(stream.value()); - } -}; diff --git a/cpp/src/hash/managed.cuh b/cpp/src/hash/managed.cuh deleted file mode 100644 index 9797c83c47c..00000000000 --- a/cpp/src/hash/managed.cuh +++ /dev/null @@ -1,41 +0,0 @@ -/* - * Copyright (c) 2017-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include -#include - -struct managed { - static void* operator new(size_t n) - { - void* ptr = nullptr; - cudaError_t result = cudaMallocManaged(&ptr, n); - if (cudaSuccess != result || 0 == ptr) throw std::bad_alloc(); - return ptr; - } - - static void operator delete(void* ptr) noexcept - { - auto const free_result = cudaFree(ptr); - assert(free_result == cudaSuccess); - } -}; - -inline bool isPtrManaged(cudaPointerAttributes attr) -{ - return (attr.type == cudaMemoryTypeManaged); -} diff --git a/cpp/src/io/json/legacy/json_gpu.cu b/cpp/src/io/json/legacy/json_gpu.cu deleted file mode 100644 index ff4845fcecb..00000000000 --- a/cpp/src/io/json/legacy/json_gpu.cu +++ /dev/null @@ -1,615 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "io/utilities/column_type_histogram.hpp" -#include "io/utilities/parsing_utils.cuh" -#include "io/utilities/trie.cuh" -#include "json_gpu.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using cudf::device_span; -using cudf::detail::grid_1d; - -namespace cudf::io::json::detail::legacy { - -namespace { -/** - * @brief CUDA Kernel that adjusts the row range to exclude the character outside of the top level - * brackets. - * - * The top level brackets characters are excluded from the resulting range. - * - * @param[in] begin Pointer to the first character in the row - * @param[in] end pointer to the first character after the row - */ -__device__ std::pair limit_range_to_brackets(char const* begin, - char const* end) -{ - auto const data_begin = thrust::next(thrust::find_if( - thrust::seq, begin, end, [] __device__(auto c) { return c == '[' || c == '{'; })); - auto const data_end = thrust::next(thrust::find_if(thrust::seq, - thrust::make_reverse_iterator(end), - thrust::make_reverse_iterator(data_begin), - [](auto c) { return c == ']' || c == '}'; })) - .base(); - return {data_begin, data_end}; -} - -/** - * @brief Find the first JSON object key in the range. - * - * Assumes that begin is not in the middle of a field. - * - * @param[in] begin Pointer to the first character in the parsing range - * @param[in] end pointer to the first character after the parsing range - * @param[in] quotechar The character used to denote quotes - * - * @return Begin and end iterators of the key name; (`end`, `end`) if a key is not found - */ -__device__ std::pair get_next_key(char const* begin, - char const* end, - char quotechar) -{ - // Key starts after the first quote - auto const key_begin = thrust::find(thrust::seq, begin, end, quotechar) + 1; - if (key_begin > end) return {end, end}; - - // Key ends after the next unescaped quote - auto const key_end_pair = thrust::mismatch( - thrust::seq, key_begin, end - 1, key_begin + 1, [quotechar] __device__(auto prev_ch, auto ch) { - return !(ch == quotechar && prev_ch != '\\'); - }); - - return {key_begin, key_end_pair.second}; -} - -/** - * @brief Returns true is the input character is a valid digit. - * Supports both decimal and hexadecimal digits (uppercase and lowercase). - * - * @param c Character to check - * @param is_hex Whether to check as a hexadecimal - * - * @return `true` if it is digit-like, `false` otherwise - */ -__device__ __inline__ bool is_digit(char c, bool is_hex = false) -{ - if (c >= '0' && c <= '9') return true; - - if (is_hex) { - if (c >= 'A' && c <= 'F') return true; - if (c >= 'a' && c <= 'f') return true; - } - - return false; -} - -/** - * @brief Returns true if the counters indicate a potentially valid float. - * False positives are possible because positions are not taken into account. - * For example, field "e.123-" would match the pattern. - */ -__device__ __inline__ bool is_like_float( - long len, long digit_cnt, long decimal_cnt, long dash_cnt, long exponent_cnt) -{ - // Can't have more than one exponent and one decimal point - if (decimal_cnt > 1) return false; - if (exponent_cnt > 1) return false; - // Without the exponent or a decimal point, this is an integer, not a float - if (decimal_cnt == 0 && exponent_cnt == 0) return false; - - // Can only have one '-' per component - if (dash_cnt > 1 + exponent_cnt) return false; - - // If anything other than these characters is present, it's not a float - if (digit_cnt + decimal_cnt + dash_cnt + exponent_cnt != len) return false; - - // Needs at least 1 digit, 2 if exponent is present - if (digit_cnt < 1 + exponent_cnt) return false; - - return true; -} - -/** - * @brief Contains information on a JSON file field. - */ -struct field_descriptor { - cudf::size_type column; - char const* value_begin; - char const* value_end; - bool is_quoted; -}; - -/** - * @brief Parse the first field in the given range and return its descriptor. - * - * @param[in] begin Pointer to the first character in the parsing range - * @param[in] end pointer to the first character after the parsing range - * @param[in] opts The global parsing behavior options - * @param[in] field_idx Index of the current field in the input row - * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. - * nullptr is passed when the input file does not consist of objects. - * @return Descriptor of the parsed field - */ -__device__ field_descriptor next_field_descriptor(char const* begin, - char const* end, - parse_options_view const& opts, - cudf::size_type field_idx, - col_map_type col_map) -{ - auto const desc_pre_trim = - col_map.capacity() == 0 - // No key - column and begin are trivial - ? field_descriptor{field_idx, - begin, - cudf::io::gpu::seek_field_end(begin, end, opts, true), - false} - : [&]() { - auto const key_range = get_next_key(begin, end, opts.quotechar); - auto const key_hash = cudf::hashing::detail::MurmurHash3_x86_32{}( - cudf::string_view(key_range.first, key_range.second - key_range.first)); - auto const hash_col = col_map.find(key_hash); - // Fall back to field index if not found (parsing error) - auto const column = (hash_col != col_map.end()) ? (*hash_col).second : field_idx; - - // Skip the colon between the key and the value - auto const value_begin = thrust::find(thrust::seq, key_range.second, end, ':') + 1; - return field_descriptor{column, - value_begin, - cudf::io::gpu::seek_field_end(value_begin, end, opts, true), - false}; - }(); - - // Modify start & end to ignore whitespace and quotechars - auto const trimmed_value_range = - trim_whitespaces(desc_pre_trim.value_begin, desc_pre_trim.value_end); - bool const is_quoted = - thrust::distance(trimmed_value_range.first, trimmed_value_range.second) >= 2 and - *trimmed_value_range.first == opts.quotechar and - *thrust::prev(trimmed_value_range.second) == opts.quotechar; - return {desc_pre_trim.column, - trimmed_value_range.first + static_cast(is_quoted), - trimmed_value_range.second - static_cast(is_quoted), - is_quoted}; -} - -/** - * @brief Returns the range that contains the data in a given row. - * - * Excludes the top-level brackets. - * - * @param[in] data Device span pointing to the JSON data in device memory - * @param[in] row_offsets The offset of each row in the input - * @param[in] row Index of the row for which the range is returned - * - * @return The begin and end iterators of the row data. - */ -__device__ std::pair get_row_data_range( - device_span const data, device_span const row_offsets, size_type row) -{ - auto const row_begin = data.begin() + row_offsets[row]; - auto const row_end = - data.begin() + ((row < row_offsets.size() - 1) ? row_offsets[row + 1] : data.size()); - return limit_range_to_brackets(row_begin, row_end); -} - -/** - * @brief CUDA kernel that parses and converts plain text data into cuDF column data. - * - * Data is processed one record at a time - * - * @param[in] opts A set of parsing options - * @param[in] data The entire data to read - * @param[in] row_offsets The offset of each row in the input - * @param[in] column_types The data type of each column - * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. - * nullptr is passed when the input file does not consist of objects. - * @param[out] output_columns The output column data - * @param[out] valid_fields The bitmaps indicating whether column fields are valid - * @param[out] num_valid_fields The numbers of valid fields in columns - */ -CUDF_KERNEL void convert_data_to_columns_kernel(parse_options_view opts, - device_span const data, - device_span const row_offsets, - device_span const column_types, - col_map_type col_map, - device_span const output_columns, - device_span const valid_fields, - device_span const num_valid_fields) -{ - auto const rec_id = grid_1d::global_thread_id(); - if (rec_id >= row_offsets.size()) return; - - auto const row_data_range = get_row_data_range(data, row_offsets, rec_id); - - auto current = row_data_range.first; - for (size_type input_field_index = 0; - input_field_index < column_types.size() && current < row_data_range.second; - input_field_index++) { - auto const desc = - next_field_descriptor(current, row_data_range.second, opts, input_field_index, col_map); - auto const value_len = static_cast(std::max(desc.value_end - desc.value_begin, 0L)); - auto const is_quoted = static_cast(desc.is_quoted); - - current = desc.value_end + 1; - - using string_index_pair = thrust::pair; - - if (!serialized_trie_contains(opts.trie_na, - {desc.value_begin - is_quoted, value_len + is_quoted * 2})) { - // Type dispatcher does not handle strings - if (column_types[desc.column].id() == type_id::STRING) { - auto str_list = static_cast(output_columns[desc.column]); - str_list[rec_id].first = desc.value_begin; - str_list[rec_id].second = value_len; - - // set the valid bitmap - all bits were set to 0 to start - set_bit(valid_fields[desc.column], rec_id); - atomicAdd(&num_valid_fields[desc.column], 1); - } else { - if (cudf::type_dispatcher(column_types[desc.column], - ConvertFunctor{}, - desc.value_begin, - desc.value_end, - output_columns[desc.column], - rec_id, - column_types[desc.column], - opts, - false)) { - // set the valid bitmap - all bits were set to 0 to start - set_bit(valid_fields[desc.column], rec_id); - atomicAdd(&num_valid_fields[desc.column], 1); - } - } - } else if (column_types[desc.column].id() == type_id::STRING) { - auto str_list = static_cast(output_columns[desc.column]); - str_list[rec_id].first = nullptr; - str_list[rec_id].second = 0; - } - } -} - -/** - * @brief CUDA kernel that processes a buffer of data and determines information about the - * column types within. - * - * Data is processed in one row/record at a time, so the number of total - * threads (tid) is equal to the number of rows. - * - * @param[in] opts A set of parsing options - * @param[in] data Input data buffer - * @param[in] rec_starts The offset of each row in the input - * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. - * nullptr is passed when the input file does not consist of objects. - * @param[in] num_columns The number of columns of input data - * @param[out] column_infos The count for each column data type - */ -CUDF_KERNEL void detect_data_types_kernel( - parse_options_view const opts, - device_span const data, - device_span const row_offsets, - col_map_type col_map, - int num_columns, - device_span const column_infos) -{ - auto const rec_id = grid_1d::global_thread_id(); - if (rec_id >= row_offsets.size()) return; - - auto const are_rows_objects = col_map.capacity() != 0; - auto const row_data_range = get_row_data_range(data, row_offsets, rec_id); - - size_type input_field_index = 0; - for (auto current = row_data_range.first; - input_field_index < num_columns && current < row_data_range.second; - input_field_index++) { - auto const desc = - next_field_descriptor(current, row_data_range.second, opts, input_field_index, col_map); - auto const value_len = static_cast(std::max(desc.value_end - desc.value_begin, 0L)); - - // Advance to the next field; +1 to skip the delimiter - current = desc.value_end + 1; - - // Checking if the field is empty/valid - if (serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { - // Increase the null count for array rows, where the null count is initialized to zero. - if (!are_rows_objects) { atomicAdd(&column_infos[desc.column].null_count, 1); } - continue; - } else if (are_rows_objects) { - // For files with object rows, null count is initialized to row count. The value is decreased - // here for every valid field. - atomicAdd(&column_infos[desc.column].null_count, -1); - } - // Don't need counts to detect strings, any field in quotes is deduced to be a string - if (desc.is_quoted) { - atomicAdd(&column_infos[desc.column].string_count, 1); - continue; - } - - int digit_count = 0; - int decimal_count = 0; - int slash_count = 0; - int dash_count = 0; - int plus_count = 0; - int colon_count = 0; - int exponent_count = 0; - int other_count = 0; - - bool const maybe_hex = - ((value_len > 2 && *desc.value_begin == '0' && *(desc.value_begin + 1) == 'x') || - (value_len > 3 && *desc.value_begin == '-' && *(desc.value_begin + 1) == '0' && - *(desc.value_begin + 2) == 'x')); - for (auto pos = desc.value_begin; pos < desc.value_end; ++pos) { - if (is_digit(*pos, maybe_hex)) { - digit_count++; - continue; - } - // Looking for unique characters that will help identify column types - switch (*pos) { - case '.': decimal_count++; break; - case '-': dash_count++; break; - case '+': plus_count++; break; - case '/': slash_count++; break; - case ':': colon_count++; break; - case 'e': - case 'E': - if (!maybe_hex && pos > desc.value_begin && pos < desc.value_end - 1) exponent_count++; - break; - default: other_count++; break; - } - } - - // Integers have to have the length of the string - int int_req_number_cnt = value_len; - // Off by one if they start with a minus sign - if ((*desc.value_begin == '-' || *desc.value_begin == '+') && value_len > 1) { - --int_req_number_cnt; - } - // Off by one if they are a hexadecimal number - if (maybe_hex) { --int_req_number_cnt; } - if (serialized_trie_contains(opts.trie_true, {desc.value_begin, value_len}) || - serialized_trie_contains(opts.trie_false, {desc.value_begin, value_len})) { - atomicAdd(&column_infos[desc.column].bool_count, 1); - } else if (digit_count == int_req_number_cnt) { - bool is_negative = (*desc.value_begin == '-'); - char const* data_begin = desc.value_begin + (is_negative || (*desc.value_begin == '+')); - cudf::size_type* ptr = cudf::io::gpu::infer_integral_field_counter( - data_begin, data_begin + digit_count, is_negative, column_infos[desc.column]); - atomicAdd(ptr, 1); - } else if (is_like_float( - value_len, digit_count, decimal_count, dash_count + plus_count, exponent_count)) { - atomicAdd(&column_infos[desc.column].float_count, 1); - } - // A date-time field cannot have more than 3 non-special characters - // A number field cannot have more than one decimal point - else if (other_count > 3 || decimal_count > 1) { - atomicAdd(&column_infos[desc.column].string_count, 1); - } else { - // A date field can have either one or two '-' or '\'; A legal combination will only have one - // of them To simplify the process of auto column detection, we are not covering all the - // date-time formation permutations - if ((dash_count > 0 && dash_count <= 2 && slash_count == 0) || - (dash_count == 0 && slash_count > 0 && slash_count <= 2)) { - if (colon_count <= 2) { - atomicAdd(&column_infos[desc.column].datetime_count, 1); - } else { - atomicAdd(&column_infos[desc.column].string_count, 1); - } - } else { - // Default field type is string - atomicAdd(&column_infos[desc.column].string_count, 1); - } - } - } - if (!are_rows_objects) { - // For array rows, mark missing fields as null - for (; input_field_index < num_columns; ++input_field_index) - atomicAdd(&column_infos[input_field_index].null_count, 1); - } -} - -/** - * @brief Input data range that contains a field in key:value format. - */ -struct key_value_range { - char const* key_begin; - char const* key_end; - char const* value_begin; - char const* value_end; -}; - -/** - * @brief Parse the next field in key:value format and return ranges of its parts. - */ -__device__ key_value_range get_next_key_value_range(char const* begin, - char const* end, - parse_options_view const& opts) -{ - auto const key_range = get_next_key(begin, end, opts.quotechar); - - // Colon between the key and the value - auto const colon = thrust::find(thrust::seq, key_range.second, end, ':'); - if (colon == end) return {end, end, end}; - - // Field value (including delimiters) - auto const value_end = cudf::io::gpu::seek_field_end(colon + 1, end, opts, true); - return {key_range.first, key_range.second, colon + 1, value_end}; -} - -/** - * @brief Cuda kernel that collects information about JSON object keys in the file. - * - * @param[in] options A set of parsing options - * @param[in] data Input data buffer - * @param[in] row_offsets The offset of each row in the input - * @param[out] keys_cnt Number of keys found in the file - * @param[out] keys_info optional, information (offset, length, hash) for each found key - */ -CUDF_KERNEL void collect_keys_info_kernel(parse_options_view const options, - device_span const data, - device_span const row_offsets, - unsigned long long int* keys_cnt, - thrust::optional keys_info) -{ - auto const rec_id = grid_1d::global_thread_id(); - if (rec_id >= row_offsets.size()) return; - - auto const row_data_range = get_row_data_range(data, row_offsets, rec_id); - - auto advance = [&](char const* begin) { - return get_next_key_value_range(begin, row_data_range.second, options); - }; - for (auto field_range = advance(row_data_range.first); - field_range.key_begin < row_data_range.second; - field_range = advance(field_range.value_end)) { - auto const idx = atomicAdd(keys_cnt, 1ULL); - if (keys_info.has_value()) { - auto const len = field_range.key_end - field_range.key_begin; - keys_info->column(0).element(idx) = field_range.key_begin - data.begin(); - keys_info->column(1).element(idx) = len; - keys_info->column(2).element(idx) = - cudf::hashing::detail::MurmurHash3_x86_32{}( - cudf::string_view(field_range.key_begin, len)); - } - } -} - -} // namespace - -/** - * @copydoc cudf::io::json::detail::legacy::convert_json_to_columns - */ -void convert_json_to_columns(parse_options_view const& opts, - device_span const data, - device_span const row_offsets, - device_span const column_types, - col_map_type* col_map, - device_span const output_columns, - device_span const valid_fields, - device_span num_valid_fields, - rmm::cuda_stream_view stream) -{ - int block_size; - int min_grid_size; - CUDF_CUDA_TRY(cudaOccupancyMaxPotentialBlockSize( - &min_grid_size, &block_size, convert_data_to_columns_kernel)); - - int const grid_size = (row_offsets.size() + block_size - 1) / block_size; - - convert_data_to_columns_kernel<<>>(opts, - data, - row_offsets, - column_types, - *col_map, - output_columns, - valid_fields, - num_valid_fields); - - CUDF_CHECK_CUDA(stream.value()); -} - -/** - * @copydoc cudf::io::json::detail::legacy::detect_data_types - */ - -std::vector detect_data_types( - parse_options_view const& options, - device_span const data, - device_span const row_offsets, - bool do_set_null_count, - int num_columns, - col_map_type* col_map, - rmm::cuda_stream_view stream) -{ - int block_size; - int min_grid_size; - CUDF_CUDA_TRY( - cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, detect_data_types_kernel)); - - auto d_column_infos = [&]() { - if (do_set_null_count) { - rmm::device_uvector d_column_infos(num_columns, stream); - // Set the null count to the row count (all fields assumes to be null). - thrust::generate( - rmm::exec_policy(stream), - d_column_infos.begin(), - d_column_infos.end(), - [num_records = static_cast(row_offsets.size())] __device__() { - return cudf::io::column_type_histogram{num_records}; - }); - return d_column_infos; - } else { - return cudf::detail::make_zeroed_device_uvector_async( - num_columns, stream, rmm::mr::get_current_device_resource()); - } - }(); - - // Calculate actual block count to use based on records count - int const grid_size = (row_offsets.size() + block_size - 1) / block_size; - - detect_data_types_kernel<<>>( - options, data, row_offsets, *col_map, num_columns, d_column_infos); - - return cudf::detail::make_std_vector_sync(d_column_infos, stream); -} - -/** - * @copydoc cudf::io::json::detail::legacy::collect_keys_info - */ -void collect_keys_info(parse_options_view const& options, - device_span const data, - device_span const row_offsets, - unsigned long long int* keys_cnt, - thrust::optional keys_info, - rmm::cuda_stream_view stream) -{ - int block_size; - int min_grid_size; - CUDF_CUDA_TRY( - cudaOccupancyMaxPotentialBlockSize(&min_grid_size, &block_size, collect_keys_info_kernel)); - - // Calculate actual block count to use based on records count - int const grid_size = (row_offsets.size() + block_size - 1) / block_size; - - collect_keys_info_kernel<<>>( - options, data, row_offsets, keys_cnt, keys_info); - - CUDF_CHECK_CUDA(stream.value()); -} - -} // namespace cudf::io::json::detail::legacy diff --git a/cpp/src/io/json/legacy/json_gpu.hpp b/cpp/src/io/json/legacy/json_gpu.hpp deleted file mode 100644 index 853e30c9427..00000000000 --- a/cpp/src/io/json/legacy/json_gpu.hpp +++ /dev/null @@ -1,99 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#pragma once - -#include "hash/concurrent_unordered_map.cuh" -#include "io/utilities/column_type_histogram.hpp" -#include "io/utilities/parsing_utils.cuh" - -#include -#include -#include - -#include - -#include - -using cudf::device_span; - -namespace cudf::io::json::detail::legacy { - -using col_map_type = concurrent_unordered_map; -/** - * @brief Convert a buffer of input data (text) into raw cuDF column data. - * - * @param[in] options A set of parsing options - * @param[in] data The entire data to read - * @param[in] row_offsets The start of each data record - * @param[in] dtypes The data type of each column - * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. - * nullptr is passed when the input file does not consist of objects. - * @param[out] output_columns The output column data - * @param[out] valid_fields The bitmaps indicating whether column fields are valid - * @param[out] num_valid_fields The numbers of valid fields in columns - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - */ -void convert_json_to_columns(parse_options_view const& options, - device_span data, - device_span row_offsets, - device_span column_types, - col_map_type* col_map, - device_span output_columns, - device_span valid_fields, - device_span num_valid_fields, - rmm::cuda_stream_view stream); - -/** - * @brief Process a buffer of data and determine information about the column types within. - * - * @param[in] options A set of parsing options - * @param[in] data Input data buffer - * @param[in] row_offsets The offset of each row in the input - * @param[in] num_columns The number of columns of input data - * @param[in] col_map Pointer to the (column name hash -> column index) map in device memory. - * nullptr is passed when the input file does not consist of objects. - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * - * @returns The count for each column data type - */ -std::vector detect_data_types( - parse_options_view const& options, - device_span data, - device_span row_offsets, - bool do_set_null_count, - int num_columns, - col_map_type* col_map, - rmm::cuda_stream_view stream); - -/** - * @brief Collects information about JSON object keys in the file. - * - * @param[in] options A set of parsing options - * @param[in] data Input data buffer - * @param[in] row_offsets The offset of each row in the input - * @param[out] keys_cnt Number of keys found in the file - * @param[out] keys_info optional, information (offset, length, hash) for each found key - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - */ -void collect_keys_info(parse_options_view const& options, - device_span data, - device_span row_offsets, - unsigned long long int* keys_cnt, - thrust::optional keys_info, - rmm::cuda_stream_view stream); - -} // namespace cudf::io::json::detail::legacy diff --git a/cpp/src/io/json/legacy/read_json.hpp b/cpp/src/io/json/legacy/read_json.hpp deleted file mode 100644 index 2c02fdd402f..00000000000 --- a/cpp/src/io/json/legacy/read_json.hpp +++ /dev/null @@ -1,38 +0,0 @@ -/* - * Copyright (c) 2023-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include - -#include -#include - -#include - -#include -#include - -namespace cudf::io { -class json_reader_options; // forward decl -} - -namespace cudf::io::json::detail::legacy { - -table_with_metadata read_json(host_span> sources, - json_reader_options const& reader_opts, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); - -} // namespace cudf::io::json::detail::legacy diff --git a/cpp/src/io/json/legacy/reader_impl.cu b/cpp/src/io/json/legacy/reader_impl.cu deleted file mode 100644 index 846b3cfab4e..00000000000 --- a/cpp/src/io/json/legacy/reader_impl.cu +++ /dev/null @@ -1,667 +0,0 @@ -/* - * Copyright (c) 2020-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "hash/concurrent_unordered_map.cuh" -#include "io/comp/io_uncomp.hpp" -#include "io/utilities/column_buffer.hpp" -#include "io/utilities/parsing_utils.cuh" -#include "json_gpu.hpp" - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -using cudf::host_span; - -namespace cudf::io::json::detail::legacy { - -using col_map_ptr_type = std::unique_ptr>; - -/** - * @brief Aggregate the table containing keys info by their hash values. - * - * @param[in] info Table with columns containing key offsets, lengths and hashes, respectively - * - * @return Table with data aggregated by key hash values - */ -std::unique_ptr aggregate_keys_info(std::unique_ptr
info) -{ - auto const info_view = info->view(); - std::vector requests; - requests.emplace_back(groupby::aggregation_request{info_view.column(0)}); - requests.back().aggregations.emplace_back(make_min_aggregation()); - requests.back().aggregations.emplace_back(make_nth_element_aggregation(0)); - - requests.emplace_back(groupby::aggregation_request{info_view.column(1)}); - requests.back().aggregations.emplace_back(make_min_aggregation()); - requests.back().aggregations.emplace_back(make_nth_element_aggregation(0)); - - // Aggregate by hash values - groupby::groupby gb_obj( - table_view({info_view.column(2)}), null_policy::EXCLUDE, sorted::NO, {}, {}); - - auto result = gb_obj.aggregate(requests); // TODO: no stream parameter? - - std::vector> out_columns; - out_columns.emplace_back(std::move(result.second[0].results[0])); // offsets - out_columns.emplace_back(std::move(result.second[1].results[0])); // lengths - out_columns.emplace_back(std::move(result.first->release()[0])); // hashes - return std::make_unique
(std::move(out_columns)); -} - -/** - * @brief Initializes the (key hash -> column index) hash map. - */ -col_map_ptr_type create_col_names_hash_map(column_view column_name_hashes, - rmm::cuda_stream_view stream) -{ - auto key_col_map = col_map_type::create(column_name_hashes.size(), stream); - auto const column_data = column_name_hashes.data(); - thrust::for_each_n(rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - column_name_hashes.size(), - [map = *key_col_map, column_data] __device__(size_type idx) mutable { - map.insert(thrust::make_pair(column_data[idx], idx)); - }); - return key_col_map; -} - -/** - * @brief Create a table whose columns contain the information on JSON objects' keys. - * - * The columns contain name offsets in the file, name lengths and name hashes, respectively. - * - * @param[in] options Parsing options (e.g. delimiter and quotation character) - * @param[in] data Input JSON device data - * @param[in] row_offsets Device array of row start locations in the input buffer - * @param[in] stream CUDA stream used for device memory operations and kernel launches - * - * @return std::unique_ptr
cudf table with three columns (offsets, lengths, hashes) - */ -std::unique_ptr
create_json_keys_info_table(parse_options_view const& parse_opts, - device_span const data, - device_span const row_offsets, - rmm::cuda_stream_view stream) -{ - // Count keys - rmm::device_scalar key_counter(0, stream); - collect_keys_info(parse_opts, data, row_offsets, key_counter.data(), {}, stream); - - // Allocate columns to store hash value, length, and offset of each JSON object key in the input - auto const num_keys = key_counter.value(stream); - std::vector> info_columns; - info_columns.emplace_back( - make_numeric_column(data_type(type_id::UINT64), num_keys, mask_state::UNALLOCATED, stream)); - info_columns.emplace_back( - make_numeric_column(data_type(type_id::UINT16), num_keys, mask_state::UNALLOCATED, stream)); - info_columns.emplace_back( - make_numeric_column(data_type(type_id::UINT32), num_keys, mask_state::UNALLOCATED, stream)); - // Create a table out of these columns to pass them around more easily - auto info_table = std::make_unique
(std::move(info_columns)); - auto const info_table_mdv = mutable_table_device_view::create(info_table->mutable_view(), stream); - - // Reset the key counter - now used for indexing - key_counter.set_value_to_zero_async(stream); - // Fill the allocated columns - collect_keys_info(parse_opts, data, row_offsets, key_counter.data(), {*info_table_mdv}, stream); - return info_table; -} - -/** - * @brief Extract the keys from the JSON file the name offsets/lengths. - */ -std::vector create_key_strings(char const* h_data, - table_view sorted_info, - rmm::cuda_stream_view stream) -{ - auto const num_cols = sorted_info.num_rows(); - std::vector h_offsets(num_cols); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_offsets.data(), - sorted_info.column(0).data(), - sizeof(uint64_t) * num_cols, - cudaMemcpyDefault, - stream.value())); - - std::vector h_lens(num_cols); - CUDF_CUDA_TRY(cudaMemcpyAsync(h_lens.data(), - sorted_info.column(1).data(), - sizeof(uint16_t) * num_cols, - cudaMemcpyDefault, - stream.value())); - - std::vector names(num_cols); - std::transform(h_offsets.cbegin(), - h_offsets.cend(), - h_lens.cbegin(), - names.begin(), - [&](auto offset, auto len) { return std::string(h_data + offset, len); }); - return names; -} - -auto sort_keys_info_by_offset(std::unique_ptr
info) -{ - auto const agg_offset_col_view = info->get_column(0).view(); - return sort_by_key(info->view(), table_view({agg_offset_col_view})); -} - -/** - * @brief Extract JSON object keys from a JSON file. - * - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * - * @return Names of JSON object keys in the file - */ -std::pair, col_map_ptr_type> get_json_object_keys_hashes( - parse_options_view const& parse_opts, - host_span h_data, - device_span rec_starts, - device_span d_data, - rmm::cuda_stream_view stream) -{ - auto info = create_json_keys_info_table(parse_opts, d_data, rec_starts, stream); - - auto aggregated_info = aggregate_keys_info(std::move(info)); - auto sorted_info = sort_keys_info_by_offset(std::move(aggregated_info)); - - return {create_key_strings(h_data.data(), sorted_info->view(), stream), - create_col_names_hash_map(sorted_info->get_column(2).view(), stream)}; -} - -std::vector ingest_raw_input(host_span> sources, - compression_type compression, - size_t range_offset, - size_t range_size, - size_t range_size_padded) -{ - CUDF_FUNC_RANGE(); - // Iterate through the user defined sources and read the contents into the local buffer - size_t total_source_size = 0; - for (auto const& source : sources) { - total_source_size += source->size(); - } - total_source_size = total_source_size - (range_offset * sources.size()); - - auto buffer = std::vector(total_source_size); - - size_t bytes_read = 0; - for (auto const& source : sources) { - if (!source->is_empty()) { - auto data_size = (range_size_padded != 0) ? range_size_padded : source->size(); - auto destination = buffer.data() + bytes_read; - bytes_read += source->host_read(range_offset, data_size, destination); - } - } - - if (compression == compression_type::NONE) { - return buffer; - } else { - return decompress(compression, buffer); - } -} - -bool should_load_whole_source(json_reader_options const& reader_opts) -{ - return reader_opts.get_byte_range_offset() == 0 and // - reader_opts.get_byte_range_size() == 0; -} - -rmm::device_uvector find_record_starts(json_reader_options const& reader_opts, - host_span h_data, - device_span d_data, - rmm::cuda_stream_view stream) -{ - std::vector chars_to_count{'\n'}; - // Currently, ignoring lineterminations within quotes is handled by recording the records of both, - // and then filtering out the records that is a quotechar or a linetermination within a quotechar - // pair. - // If not starting at an offset, add an extra row to account for the first row in the file - cudf::size_type prefilter_count = ((reader_opts.get_byte_range_offset() == 0) ? 1 : 0); - if (should_load_whole_source(reader_opts)) { - prefilter_count += count_all_from_set(d_data, chars_to_count, stream); - } else { - prefilter_count += count_all_from_set(h_data, chars_to_count, stream); - } - - rmm::device_uvector rec_starts(prefilter_count, stream); - - auto* find_result_ptr = rec_starts.data(); - // Manually adding an extra row to account for the first row in the file - if (reader_opts.get_byte_range_offset() == 0) { - find_result_ptr++; - CUDF_CUDA_TRY(cudaMemsetAsync(rec_starts.data(), 0ull, sizeof(uint64_t), stream.value())); - } - - std::vector chars_to_find{'\n'}; - // Passing offset = 1 to return positions AFTER the found character - if (should_load_whole_source(reader_opts)) { - find_all_from_set(d_data, chars_to_find, 1, find_result_ptr, stream); - } else { - find_all_from_set(h_data, chars_to_find, 1, find_result_ptr, stream); - } - - // Previous call stores the record positions as encountered by all threads - // Sort the record positions as subsequent processing may require filtering - // certain rows or other processing on specific records - thrust::sort(rmm::exec_policy(stream), rec_starts.begin(), rec_starts.end()); - - auto filtered_count = prefilter_count; - - // Exclude the ending newline as it does not precede a record start - if (h_data.back() == '\n') { filtered_count--; } - rec_starts.resize(filtered_count, stream); - - return rec_starts; -} - -/** - * @brief Uploads the relevant segment of the input json data onto the GPU. - * - * Sets the d_data_ data member. - * Only rows that need to be parsed are copied, based on the byte range - * Also updates the array of record starts to match the device data offset. - */ -rmm::device_uvector upload_data_to_device(json_reader_options const& reader_opts, - host_span h_data, - rmm::device_uvector& rec_starts, - rmm::cuda_stream_view stream) -{ - CUDF_FUNC_RANGE(); - size_t end_offset = h_data.size(); - - // Trim lines that are outside range - auto h_rec_starts = cudf::detail::make_std_vector_sync(rec_starts, stream); - - if (reader_opts.get_byte_range_size() != 0) { - auto it = h_rec_starts.end() - 1; - while (it >= h_rec_starts.begin() && *it > reader_opts.get_byte_range_size()) { - end_offset = *it; - --it; - } - h_rec_starts.erase(it + 1, h_rec_starts.end()); - } - - // Resize to exclude rows outside of the range - // Adjust row start positions to account for the data subcopy - size_t start_offset = h_rec_starts.front(); - rec_starts.resize(h_rec_starts.size(), stream); - thrust::transform(rmm::exec_policy(stream), - rec_starts.begin(), - rec_starts.end(), - thrust::make_constant_iterator(start_offset), - rec_starts.begin(), - thrust::minus()); - - size_t const bytes_to_upload = end_offset - start_offset; - CUDF_EXPECTS(bytes_to_upload <= h_data.size(), - "Error finding the record within the specified byte range.\n"); - - // Upload the raw data that is within the rows of interest - return cudf::detail::make_device_uvector_async( - h_data.subspan(start_offset, bytes_to_upload), stream, rmm::mr::get_current_device_resource()); -} - -std::pair, col_map_ptr_type> get_column_names_and_map( - parse_options_view const& parse_opts, - host_span h_data, - device_span rec_starts, - device_span d_data, - rmm::cuda_stream_view stream) -{ - // If file only contains one row, use the file size for the row size - uint64_t first_row_len = d_data.size(); - if (rec_starts.size() > 1) { - // Set first_row_len to the offset of the second row, if it exists - CUDF_CUDA_TRY(cudaMemcpyAsync( - &first_row_len, rec_starts.data() + 1, sizeof(uint64_t), cudaMemcpyDefault, stream.value())); - } - std::vector first_row(first_row_len); - CUDF_CUDA_TRY(cudaMemcpyAsync(first_row.data(), - d_data.data(), - first_row_len * sizeof(char), - cudaMemcpyDefault, - stream.value())); - stream.synchronize(); - - // Determine the row format between: - // JSON array - [val1, val2, ...] and - // JSON object - {"col1":val1, "col2":val2, ...} - // based on the top level opening bracket - auto const first_square_bracket = std::find(first_row.begin(), first_row.end(), '['); - auto const first_curly_bracket = std::find(first_row.begin(), first_row.end(), '{'); - CUDF_EXPECTS(first_curly_bracket != first_row.end() || first_square_bracket != first_row.end(), - "Input data is not a valid JSON file."); - // If the first opening bracket is '{', assume object format - if (first_curly_bracket < first_square_bracket) { - // use keys as column names if input rows are objects - return get_json_object_keys_hashes(parse_opts, h_data, rec_starts, d_data, stream); - } else { - int cols_found = 0; - bool quotation = false; - auto column_names = std::vector(); - for (size_t pos = 0; pos < first_row.size(); ++pos) { - // Flip the quotation flag if current character is a quotechar - if (first_row[pos] == parse_opts.quotechar) { - quotation = !quotation; - } - // Check if end of a column/row - else if (pos == first_row.size() - 1 || - (!quotation && first_row[pos] == parse_opts.delimiter)) { - column_names.emplace_back(std::to_string(cols_found++)); - } - } - return {column_names, col_map_type::create(0, stream)}; - } -} - -std::vector get_data_types(json_reader_options const& reader_opts, - parse_options_view const& parse_opts, - std::vector const& column_names, - col_map_type* column_map, - device_span rec_starts, - device_span data, - rmm::cuda_stream_view stream) -{ - bool has_to_infer_column_types = - std::visit([](auto const& dtypes) { return dtypes.empty(); }, reader_opts.get_dtypes()); - - if (!has_to_infer_column_types) { - return std::visit( - cudf::detail::visitor_overload{ - [&](std::vector const& dtypes) { - CUDF_EXPECTS(dtypes.size() == column_names.size(), "Must specify types for all columns"); - return dtypes; - }, - [&](std::map const& dtypes) { - std::vector sorted_dtypes; - std::transform(std::cbegin(column_names), - std::cend(column_names), - std::back_inserter(sorted_dtypes), - [&](auto const& column_name) { - auto const it = dtypes.find(column_name); - CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns"); - return it->second; - }); - return sorted_dtypes; - }, - [&](std::map const& dtypes) { - std::vector sorted_dtypes; - std::transform(std::cbegin(column_names), - std::cend(column_names), - std::back_inserter(sorted_dtypes), - [&](auto const& column_name) { - auto const it = dtypes.find(column_name); - CUDF_EXPECTS(it != dtypes.end(), "Must specify types for all columns"); - return it->second.type; - }); - return sorted_dtypes; - }}, - reader_opts.get_dtypes()); - } else { - CUDF_EXPECTS(not rec_starts.empty(), "No data available for data type inference.\n"); - auto const num_columns = column_names.size(); - auto const do_set_null_count = column_map->capacity() > 0; - - auto const h_column_infos = detect_data_types( - parse_opts, data, rec_starts, do_set_null_count, num_columns, column_map, stream); - - auto get_type_id = [&](auto const& cinfo) { - auto int_count_total = - cinfo.big_int_count + cinfo.negative_small_int_count + cinfo.positive_small_int_count; - if (cinfo.null_count == static_cast(rec_starts.size())) { - // Entire column is NULL; allocate the smallest amount of memory - return type_id::INT8; - } else if (cinfo.string_count > 0) { - return type_id::STRING; - } else if (cinfo.datetime_count > 0) { - return type_id::TIMESTAMP_MILLISECONDS; - } else if (cinfo.float_count > 0) { - return type_id::FLOAT64; - } else if (cinfo.big_int_count == 0 && int_count_total != 0) { - return type_id::INT64; - } else if (cinfo.big_int_count != 0 && cinfo.negative_small_int_count != 0) { - return type_id::STRING; - } else if (cinfo.big_int_count != 0) { - return type_id::UINT64; - } else if (cinfo.bool_count > 0) { - return type_id::BOOL8; - } else { - CUDF_FAIL("Data type detection failed.\n"); - } - }; - - std::vector dtypes; - - std::transform(std::cbegin(h_column_infos), - std::cend(h_column_infos), - std::back_inserter(dtypes), - [&](auto const& cinfo) { return data_type{get_type_id(cinfo)}; }); - - return dtypes; - } -} - -table_with_metadata convert_data_to_table(parse_options_view const& parse_opts, - std::vector const& dtypes, - std::vector&& column_names, - col_map_type* column_map, - device_span rec_starts, - device_span data, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - auto const num_columns = dtypes.size(); - auto const num_records = rec_starts.size(); - - // alloc output buffers. - std::vector out_buffers; - for (size_t col = 0; col < num_columns; ++col) { - out_buffers.emplace_back(dtypes[col], num_records, true, stream, mr); - } - - thrust::host_vector h_dtypes(num_columns); - thrust::host_vector h_data(num_columns); - thrust::host_vector h_valid(num_columns); - - for (size_t i = 0; i < num_columns; ++i) { - h_dtypes[i] = dtypes[i]; - h_data[i] = out_buffers[i].data(); - h_valid[i] = out_buffers[i].null_mask(); - } - - auto d_dtypes = cudf::detail::make_device_uvector_async( - h_dtypes, stream, rmm::mr::get_current_device_resource()); - auto d_data = cudf::detail::make_device_uvector_async( - h_data, stream, rmm::mr::get_current_device_resource()); - auto d_valid = cudf::detail::make_device_uvector_async( - h_valid, stream, rmm::mr::get_current_device_resource()); - auto d_valid_counts = cudf::detail::make_zeroed_device_uvector_async( - num_columns, stream, rmm::mr::get_current_device_resource()); - - convert_json_to_columns( - parse_opts, data, rec_starts, d_dtypes, column_map, d_data, d_valid, d_valid_counts, stream); - - stream.synchronize(); - - // postprocess columns - auto target_chars = std::vector{'\\', '"', '\\', '\\', '\\', 't', '\\', 'r', '\\', 'b'}; - auto target_offsets = std::vector{0, 2, 4, 6, 8, 10}; - - auto repl_chars = std::vector{'"', '\\', '\t', '\r', '\b'}; - auto repl_offsets = std::vector{0, 1, 2, 3, 4, 5}; - - auto target = - make_strings_column(static_cast(target_offsets.size() - 1), - std::make_unique( - cudf::detail::make_device_uvector_async( - target_offsets, stream, rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0), - cudf::detail::make_device_uvector_async( - target_chars, stream, rmm::mr::get_current_device_resource()) - .release(), - 0, - {}); - auto repl = make_strings_column( - static_cast(repl_offsets.size() - 1), - std::make_unique(cudf::detail::make_device_uvector_async( - repl_offsets, stream, rmm::mr::get_current_device_resource()), - rmm::device_buffer{}, - 0), - cudf::detail::make_device_uvector_async( - repl_chars, stream, rmm::mr::get_current_device_resource()) - .release(), - 0, - {}); - - auto const h_valid_counts = cudf::detail::make_std_vector_sync(d_valid_counts, stream); - std::vector> out_columns; - for (size_t i = 0; i < num_columns; ++i) { - out_buffers[i].null_count() = num_records - h_valid_counts[i]; - - auto out_column = make_column(out_buffers[i], nullptr, std::nullopt, stream); - if (out_column->type().id() == type_id::STRING) { - // Need to remove escape character in case of '\"' and '\\' - out_columns.emplace_back(cudf::strings::detail::replace( - out_column->view(), target->view(), repl->view(), stream, mr)); - } else { - out_columns.emplace_back(std::move(out_column)); - } - if (out_columns.back()->null_count() == 0) { - out_columns.back()->set_null_mask(rmm::device_buffer{0, stream, mr}, 0); - } - } - - std::vector column_infos; - column_infos.reserve(column_names.size()); - std::transform(std::make_move_iterator(column_names.begin()), - std::make_move_iterator(column_names.end()), - std::back_inserter(column_infos), - [](auto const& col_name) { return column_name_info{col_name}; }); - - // This is to ensure the stream-ordered make_stream_column calls above complete before - // the temporary std::vectors are destroyed on exit from this function. - stream.synchronize(); - - CUDF_EXPECTS(!out_columns.empty(), "No columns created from json input"); - - return table_with_metadata{std::make_unique
(std::move(out_columns)), {column_infos}}; -} - -/** - * @brief Read an entire set or a subset of data from the source - * - * @param[in] options reader options with Number of bytes offset from the start, - * Bytes to read; use `0` for all remaining data - * @param[in] stream CUDA stream used for device memory operations and kernel launches. - * - * @return Table and its metadata - */ -table_with_metadata read_json(host_span> sources, - json_reader_options const& reader_opts, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) -{ - CUDF_EXPECTS(not sources.empty(), "No sources were defined"); - CUDF_EXPECTS(sources.size() == 1 or reader_opts.get_compression() == compression_type::NONE, - "Multiple compressed inputs are not supported"); - CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Only JSON Lines format is currently supported.\n"); - - auto parse_opts = parse_options{',', '\n', '\"', '.'}; - - parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); - parse_opts.trie_false = cudf::detail::create_serialized_trie({"false"}, stream); - parse_opts.trie_na = cudf::detail::create_serialized_trie({"", "null"}, stream); - - parse_opts.dayfirst = reader_opts.is_enabled_dayfirst(); - - auto range_offset = reader_opts.get_byte_range_offset(); - auto range_size = reader_opts.get_byte_range_size(); - auto range_size_padded = reader_opts.get_byte_range_size_with_padding(); - - auto const h_raw_data = ingest_raw_input( - sources, reader_opts.get_compression(), range_offset, range_size, range_size_padded); - host_span h_data{reinterpret_cast(h_raw_data.data()), h_raw_data.size()}; - - CUDF_EXPECTS(not h_data.empty(), "Ingest failed: uncompressed input data has zero size.\n"); - - auto d_data = rmm::device_uvector(0, stream); - - if (should_load_whole_source(reader_opts)) { - d_data = cudf::detail::make_device_uvector_async( - h_data, stream, rmm::mr::get_current_device_resource()); - } - - auto rec_starts = find_record_starts(reader_opts, h_data, d_data, stream); - - CUDF_EXPECTS(rec_starts.size() > 0, "Error enumerating records.\n"); - - if (not should_load_whole_source(reader_opts)) { - d_data = upload_data_to_device(reader_opts, h_data, rec_starts, stream); - } - - CUDF_EXPECTS(not d_data.is_empty(), "Error uploading input data to the GPU.\n"); - - auto column_names_and_map = - get_column_names_and_map(parse_opts.view(), h_data, rec_starts, d_data, stream); - - auto column_names = std::get<0>(column_names_and_map); - auto column_map = std::move(std::get<1>(column_names_and_map)); - - CUDF_EXPECTS(not column_names.empty(), "Error determining column names.\n"); - - auto dtypes = get_data_types( - reader_opts, parse_opts.view(), column_names, column_map.get(), rec_starts, d_data, stream); - - CUDF_EXPECTS(not dtypes.empty(), "Error in data type detection.\n"); - - return convert_data_to_table(parse_opts.view(), - dtypes, - std::move(column_names), - column_map.get(), - rec_starts, - d_data, - stream, - mr); -} - -} // namespace cudf::io::json::detail::legacy diff --git a/cpp/src/io/json/read_json.cu b/cpp/src/io/json/read_json.cu index ea52dce020e..df5c7bc21e1 100644 --- a/cpp/src/io/json/read_json.cu +++ b/cpp/src/io/json/read_json.cu @@ -15,7 +15,6 @@ */ #include "io/comp/io_uncomp.hpp" -#include "io/json/legacy/read_json.hpp" #include "io/json/nested_json.hpp" #include "read_json.hpp" @@ -267,14 +266,6 @@ table_with_metadata read_json(host_span> sources, { CUDF_FUNC_RANGE(); - // TODO remove this if-statement once legacy is removed -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wdeprecated-declarations" - if (reader_opts.is_enabled_legacy()) { - return legacy::read_json(sources, reader_opts, stream, mr); - } -#pragma GCC diagnostic pop - if (reader_opts.get_byte_range_offset() != 0 or reader_opts.get_byte_range_size() != 0) { CUDF_EXPECTS(reader_opts.is_enabled_lines(), "Specifying a byte range is supported only for JSON Lines"); diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index db934818ae7..2b8c1b02b40 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -189,10 +189,6 @@ ConfigureTest( PERCENT 70 ) -# ################################################################################################## -# * hash_map tests -------------------------------------------------------------------------------- -ConfigureTest(HASH_MAP_TEST hash_map/map_test.cu) - # ################################################################################################## # * quantiles tests ------------------------------------------------------------------------------- ConfigureTest( diff --git a/cpp/tests/hash_map/map_test.cu b/cpp/tests/hash_map/map_test.cu deleted file mode 100644 index 4b10716706b..00000000000 --- a/cpp/tests/hash_map/map_test.cu +++ /dev/null @@ -1,217 +0,0 @@ -/* - * Copyright (c) 2018-2024, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -#include "hash/concurrent_unordered_map.cuh" - -#include -#include -#include - -#include -#include - -#include -#include - -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -template -struct key_value_types { - using key_type = K; - using value_type = V; - using pair_type = thrust::pair; - using map_type = concurrent_unordered_map; -}; - -template -struct InsertTest : public cudf::test::BaseFixture { - using key_type = typename T::key_type; - using value_type = typename T::value_type; - using pair_type = typename T::pair_type; - using map_type = typename T::map_type; - - InsertTest() - { - // prevent overflow of small types - const size_t input_size = - std::min(static_cast(size), std::numeric_limits::max()); - pairs.resize(input_size, cudf::get_default_stream()); - map = std::move(map_type::create(compute_hash_table_size(size), cudf::get_default_stream())); - cudf::get_default_stream().synchronize(); - } - - const cudf::size_type size{10000}; - rmm::device_uvector pairs{static_cast(size), cudf::get_default_stream()}; - std::unique_ptr> map; -}; - -using TestTypes = ::testing::Types, - key_value_types, - key_value_types, - key_value_types, - key_value_types>; - -TYPED_TEST_SUITE(InsertTest, TestTypes); - -template -struct insert_pair { - insert_pair(map_type _map) : map{_map} {} - - __device__ bool operator()(pair_type const& pair) - { - auto result = map.insert(pair); - if (result.first == map.end()) { return false; } - return result.second; - } - - map_type map; -}; - -template -struct find_pair { - find_pair(map_type _map) : map{_map} {} - - __device__ bool operator()(pair_type const& pair) - { - auto result = map.find(pair.first); - if (result == map.end()) { return false; } - return *result == pair; - } - map_type map; -}; - -template -struct unique_pair_generator { - __device__ pair_type operator()(cudf::size_type i) - { - return thrust::make_pair(key_type(i), value_type(i)); - } -}; - -template -struct identical_pair_generator { - identical_pair_generator(key_type k = 42, value_type v = 42) : key{k}, value{v} {} - __device__ pair_type operator()(cudf::size_type i) { return thrust::make_pair(key, value); } - key_type key; - value_type value; -}; - -template -struct identical_key_generator { - identical_key_generator(key_type k = 42) : key{k} {} - __device__ pair_type operator()(cudf::size_type i) - { - return thrust::make_pair(key, value_type(i)); - } - key_type key; -}; - -TYPED_TEST(InsertTest, UniqueKeysUniqueValues) -{ - using map_type = typename TypeParam::map_type; - using pair_type = typename TypeParam::pair_type; - thrust::tabulate(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.end(), - unique_pair_generator{}); - // All pairs should be new inserts - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.end(), - insert_pair{*this->map})); - - // All pairs should be present in the map - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.end(), - find_pair{*this->map})); -} - -TYPED_TEST(InsertTest, IdenticalKeysIdenticalValues) -{ - using map_type = typename TypeParam::map_type; - using pair_type = typename TypeParam::pair_type; - thrust::tabulate(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.end(), - identical_pair_generator{}); - // Insert a single pair - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.begin() + 1, - insert_pair{*this->map})); - // Identical inserts should all return false (no new insert) - EXPECT_FALSE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.end(), - insert_pair{*this->map})); - - // All pairs should be present in the map - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.end(), - find_pair{*this->map})); -} - -TYPED_TEST(InsertTest, IdenticalKeysUniqueValues) -{ - using map_type = typename TypeParam::map_type; - using pair_type = typename TypeParam::pair_type; - thrust::tabulate(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.end(), - identical_key_generator{}); - - // Insert a single pair - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.begin() + 1, - insert_pair{*this->map})); - - // Identical key inserts should all return false (no new insert) - EXPECT_FALSE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin() + 1, - this->pairs.end(), - insert_pair{*this->map})); - - // Only first pair is present in map - EXPECT_TRUE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin(), - this->pairs.begin() + 1, - find_pair{*this->map})); - - EXPECT_FALSE(thrust::all_of(rmm::exec_policy(cudf::get_default_stream()), - this->pairs.begin() + 1, - this->pairs.end(), - find_pair{*this->map})); -} - -CUDF_TEST_PROGRAM_MAIN() diff --git a/cpp/tests/io/json_test.cpp b/cpp/tests/io/json_test.cpp index 35e6adf20e7..9d766e80094 100644 --- a/cpp/tests/io/json_test.cpp +++ b/cpp/tests/io/json_test.cpp @@ -264,13 +264,13 @@ struct JsonValidFixedPointReaderTest : public JsonFixedPointReaderTestget_column(1), float64_wrapper{{1.1, 2.2, 3.3, 4.4}}); } -// This can be removed once the legacy option has been removed. -// The read_json only throws with legacy(true) -TEST_F(JsonReaderTest, DISABLED_BadDtypeParams) -{ - std::string buffer = "[1,2,3,4]"; - - cudf::io::json_reader_options options_vec = - cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) - .lines(true) - .dtypes({dtype()}); - - // should throw because there are four columns and only one dtype - EXPECT_THROW(cudf::io::read_json(options_vec), cudf::logic_error); - - cudf::io::json_reader_options options_map = - cudf::io::json_reader_options::builder(cudf::io::source_info{buffer.c_str(), buffer.size()}) - .lines(true) - .dtypes(std::map{{"0", dtype()}, - {"1", dtype()}, - {"2", dtype()}, - {"wrong_name", dtype()}}); - // should throw because one of the columns is not in the dtype map - EXPECT_THROW(cudf::io::read_json(options_map), cudf::logic_error); -} - TEST_F(JsonReaderTest, JsonBasic) { std::string const fname = temp_env->get_temp_dir() + "JsonBasic.json"; @@ -1372,12 +1345,8 @@ TEST_F(JsonReaderTest, JsonLines) // Read test data via nested JSON reader auto const table = cudf::io::read_json(json_lines_options); - // Read test data via legacy, non-nested JSON lines reader - auto const legacy_reader_table = cudf::io::read_json(json_lines_options); - - // Verify that the data read via non-nested JSON lines reader matches the data read via nested - // JSON reader - CUDF_TEST_EXPECT_TABLES_EQUAL(legacy_reader_table.tbl->view(), table.tbl->view()); + // TODO: Rewrite this test to check against a fixed value + CUDF_TEST_EXPECT_TABLES_EQUAL(table.tbl->view(), table.tbl->view()); } TEST_F(JsonReaderTest, JsonLongString) @@ -1548,12 +1517,8 @@ TEST_F(JsonReaderTest, LinesNoOmissions) // Read test data via nested JSON reader auto const table = cudf::io::read_json(json_lines_options); - // Read test data via legacy, non-nested JSON lines reader - auto const legacy_reader_table = cudf::io::read_json(json_lines_options); - - // Verify that the data read via non-nested JSON lines reader matches the data read via - // nested JSON reader - CUDF_TEST_EXPECT_TABLES_EQUAL(legacy_reader_table.tbl->view(), table.tbl->view()); + // TODO: Rewrite this test to check against a fixed value + CUDF_TEST_EXPECT_TABLES_EQUAL(table.tbl->view(), table.tbl->view()); } } @@ -2440,7 +2405,7 @@ TEST_F(JsonReaderTest, MapTypes) struct JsonDelimiterParamTest : public cudf::test::BaseFixture, public testing::WithParamInterface {}; -// Parametrize qualifying JSON tests for executing both nested reader and legacy JSON lines reader +// Parametrize qualifying JSON tests for multiple delimiters INSTANTIATE_TEST_SUITE_P(JsonDelimiterParamTest, JsonDelimiterParamTest, ::testing::Values('\n', '\b', '\v', '\f', 'h')); diff --git a/cpp/tests/io/nested_json_test.cpp b/cpp/tests/io/nested_json_test.cpp index d6f800cce8b..5dc25133719 100644 --- a/cpp/tests/io/nested_json_test.cpp +++ b/cpp/tests/io/nested_json_test.cpp @@ -248,7 +248,7 @@ TEST_F(JsonTest, StackContextUtf8) struct JsonDelimiterParamTest : public cudf::test::BaseFixture, public testing::WithParamInterface {}; -// Parametrize qualifying JSON tests for executing both nested reader and legacy JSON lines reader +// Parametrize qualifying JSON tests for multiple delimiters INSTANTIATE_TEST_SUITE_P(JsonDelimiterParamTest, JsonDelimiterParamTest, ::testing::Values('\n', '\b', '\v', '\f', 'h')); diff --git a/python/cudf/cudf/_lib/json.pyx b/python/cudf/cudf/_lib/json.pyx index 283a451dd4a..242727163ee 100644 --- a/python/cudf/cudf/_lib/json.pyx +++ b/python/cudf/cudf/_lib/json.pyx @@ -47,7 +47,6 @@ cpdef read_json(object filepaths_or_buffers, bool lines, object compression, object byte_range, - bool legacy, bool keep_quotes, bool mixed_types_as_string, bool prune_columns): @@ -119,7 +118,6 @@ cpdef read_json(object filepaths_or_buffers, .lines(c_lines) .byte_range_offset(c_range_offset) .byte_range_size(c_range_size) - .legacy(legacy) .build() ) if is_list_like_dtypes: diff --git a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd index 7e64a4cae29..10e43467d57 100644 --- a/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd +++ b/python/cudf/cudf/_lib/pylibcudf/libcudf/io/json.pxd @@ -87,9 +87,6 @@ cdef extern from "cudf/io/json.hpp" \ json_reader_options_builder& dayfirst( bool val ) except + - json_reader_options_builder& legacy( - bool val - ) except + json_reader_options_builder& keep_quotes( bool val ) except + diff --git a/python/cudf/cudf/io/json.py b/python/cudf/cudf/io/json.py index 03d07fc3a50..7de9705e4cb 100644 --- a/python/cudf/cudf/io/json.py +++ b/python/cudf/cudf/io/json.py @@ -99,7 +99,6 @@ def read_json( lines, compression, byte_range, - False, keep_quotes, mixed_types_as_string, prune_columns,