diff --git a/.github/workflows/pr.yaml b/.github/workflows/pr.yaml index b4c449ce5d8..766df59594b 100644 --- a/.github/workflows/pr.yaml +++ b/.github/workflows/pr.yaml @@ -50,6 +50,7 @@ jobs: test_java: ${{ steps.changed-files.outputs.java_any_changed == 'true' }} test_notebooks: ${{ steps.changed-files.outputs.notebooks_any_changed == 'true' }} test_python: ${{ steps.changed-files.outputs.python_any_changed == 'true' }} + test_cudf_pandas: ${{ steps.changed-files.outputs.cudf_pandas_any_changed == 'true' }} steps: - name: Get PR info id: get-pr-info @@ -82,6 +83,7 @@ jobs: - '!java/**' - '!notebooks/**' - '!python/**' + - '!ci/cudf_pandas_scripts/**' java: - '**' - '!CONTRIBUTING.md' @@ -90,11 +92,13 @@ jobs: - '!img/**' - '!notebooks/**' - '!python/**' + - '!ci/cudf_pandas_scripts/**' notebooks: - '**' - '!CONTRIBUTING.md' - '!README.md' - '!java/**' + - '!ci/cudf_pandas_scripts/**' python: - '**' - '!CONTRIBUTING.md' @@ -103,6 +107,16 @@ jobs: - '!img/**' - '!java/**' - '!notebooks/**' + - '!ci/cudf_pandas_scripts/**' + cudf_pandas: + - '**' + - 'ci/cudf_pandas_scripts/**' + - '!CONTRIBUTING.md' + - '!README.md' + - '!docs/**' + - '!img/**' + - '!java/**' + - '!notebooks/**' checks: secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/checks.yaml@branch-24.10 @@ -289,7 +303,7 @@ jobs: needs: [wheel-build-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 - if: needs.changed-files.outputs.test_python == 'true' + if: needs.changed-files.outputs.test_python == 'true' || needs.changed-files.outputs.test_cudf_pandas == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) @@ -300,7 +314,7 @@ jobs: needs: [wheel-build-cudf, changed-files] secrets: inherit uses: rapidsai/shared-workflows/.github/workflows/wheels-test.yaml@branch-24.10 - if: needs.changed-files.outputs.test_python == 'true' + if: needs.changed-files.outputs.test_python == 'true' || needs.changed-files.outputs.test_cudf_pandas == 'true' with: # This selects "ARCH=amd64 + the latest supported Python + CUDA". 
matrix_filter: map(select(.ARCH == "amd64")) | group_by(.CUDA_VER|split(".")|map(tonumber)|.[0]) | map(max_by([(.PY_VER|split(".")|map(tonumber)), (.CUDA_VER|split(".")|map(tonumber))])) diff --git a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py index 7a12db927e5..485b2ac8a51 100644 --- a/ci/cudf_pandas_scripts/pandas-tests/job-summary.py +++ b/ci/cudf_pandas_scripts/pandas-tests/job-summary.py @@ -67,20 +67,33 @@ def emoji_failed(x): # convert pr_results to a pandas DataFrame and then a markdown table pr_df = pd.DataFrame.from_dict(pr_results, orient="index").sort_index() main_df = pd.DataFrame.from_dict(main_results, orient="index").sort_index() -diff_df = pr_df - main_df -total_usage = pr_df['_slow_function_call'] + pr_df['_fast_function_call'] -pr_df['CPU Usage'] = ((pr_df['_slow_function_call']/total_usage)*100.0).round(1) -pr_df['GPU Usage'] = ((pr_df['_fast_function_call']/total_usage)*100.0).round(1) +total_usage = main_df["_slow_function_call"] + main_df["_fast_function_call"] +main_df["CPU Usage"] = ((main_df["_slow_function_call"] / total_usage) * 100.0).round(1) +main_df["GPU Usage"] = ((main_df["_fast_function_call"] / total_usage) * 100.0).round(1) + +total_usage = pr_df["_slow_function_call"] + pr_df["_fast_function_call"] +pr_df["CPU Usage"] = ((pr_df["_slow_function_call"] / total_usage) * 100.0).round(1) +pr_df["GPU Usage"] = ((pr_df["_fast_function_call"] / total_usage) * 100.0).round(1) + +cpu_usage_mean = pr_df["CPU Usage"].mean().round(2) +gpu_usage_mean = pr_df["GPU Usage"].mean().round(2) + +gpu_usage_rate_change = abs(pr_df["GPU Usage"].mean() - main_df["GPU Usage"].mean()) +pr_df["CPU Usage"] = pr_df["CPU Usage"].fillna(0) +pr_df["GPU Usage"] = pr_df["GPU Usage"].fillna(0) +main_df["CPU Usage"] = main_df["CPU Usage"].fillna(0) +main_df["GPU Usage"] = main_df["GPU Usage"].fillna(0) -cpu_usage_mean = pr_df['CPU Usage'].mean().round(2) -gpu_usage_mean = pr_df['GPU Usage'].mean().round(2) +diff_df = pr_df - main_df +diff_df["CPU Usage"] = diff_df["CPU Usage"].round(1).fillna(0) +diff_df["GPU Usage"] = diff_df["GPU Usage"].round(1).fillna(0) -# Add '%' suffix to 'CPU Usage' and 'GPU Usage' columns -pr_df['CPU Usage'] = pr_df['CPU Usage'].fillna(0).astype(str) + '%' -pr_df['GPU Usage'] = pr_df['GPU Usage'].fillna(0).astype(str) + '%' +# Add '%' suffix to "CPU Usage" and "GPU Usage" columns +pr_df["CPU Usage"] = pr_df["CPU Usage"].astype(str) + "%" +pr_df["GPU Usage"] = pr_df["GPU Usage"].astype(str) + "%" -pr_df = pr_df[["total", "passed", "failed", "skipped", 'CPU Usage', 'GPU Usage']] -diff_df = diff_df[["total", "passed", "failed", "skipped"]] +pr_df = pr_df[["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]] +diff_df = diff_df[["total", "passed", "failed", "skipped", "CPU Usage", "GPU Usage"]] diff_df.columns = diff_df.columns + "_diff" diff_df["passed_diff"] = diff_df["passed_diff"].map(emoji_passed) diff_df["failed_diff"] = diff_df["failed_diff"].map(emoji_failed) @@ -99,13 +112,36 @@ def emoji_failed(x): "passed_diff": "Passed delta", "failed_diff": "Failed delta", "skipped_diff": "Skipped delta", + "CPU Usage_diff": "CPU Usage delta", + "GPU Usage_diff": "GPU Usage delta", } ) -df = df.sort_values(by=["Failed tests", "Skipped tests"], ascending=False) - +df = df.sort_values(by=["CPU Usage delta", "Total tests"], ascending=False) +df["CPU Usage delta"] = df["CPU Usage delta"].map(emoji_failed) +df["GPU Usage delta"] = df["GPU Usage delta"].map(emoji_passed) +df = df[ + [ + "Total tests", + 
"CPU Usage delta", + "GPU Usage delta", + "Passed tests", + "Failed tests", + "Skipped tests", + "CPU Usage", + "GPU Usage", + "Total delta", + "Passed delta", + "Failed delta", + "Skipped delta", + ] +] print(comment) print() -print(f"Average CPU and GPU usage for the tests: {cpu_usage_mean}% and {gpu_usage_mean}%") +print( + f"Average GPU usage: {gpu_usage_mean}% {'an increase' if gpu_usage_rate_change > 0 else 'a decrease'} by {gpu_usage_rate_change}%" +) +print() +print(f"Average CPU usage: {cpu_usage_mean}%") print() print("Here are the results of running the Pandas tests against this PR:") print() diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 16b3d112992..5a05dfd0530 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -65,6 +65,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.3dev0 - pandoc +- polars>=1.8,<1.9 - pre-commit - ptxcompiler - pyarrow>=14.0.0,<18.0.0a0 diff --git a/conda/environments/all_cuda-125_arch-x86_64.yaml b/conda/environments/all_cuda-125_arch-x86_64.yaml index cce2e0eea84..8490296233d 100644 --- a/conda/environments/all_cuda-125_arch-x86_64.yaml +++ b/conda/environments/all_cuda-125_arch-x86_64.yaml @@ -63,6 +63,7 @@ dependencies: - pandas - pandas>=2.0,<2.2.3dev0 - pandoc +- polars>=1.8,<1.9 - pre-commit - pyarrow>=14.0.0,<18.0.0a0 - pydata-sphinx-theme!=0.14.2 diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 84b462bb884..136f43ee706 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -380,6 +380,7 @@ add_library( src/io/functions.cpp src/io/json/host_tree_algorithms.cu src/io/json/json_column.cu + src/io/json/column_tree_construction.cu src/io/json/json_normalization.cu src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index abc6f74fccf..4113e38dcf4 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -230,6 +230,11 @@ ConfigureNVBench(STRUCT_CREATION_NVBENCH structs/create_structs.cpp) # -------------------------------------------------------------------------------- ConfigureBench(QUANTILES_BENCH quantiles/quantiles.cpp) +# ################################################################################################## +# * tdigest benchmark +# -------------------------------------------------------------------------------- +ConfigureNVBench(TDIGEST_NVBENCH quantiles/tdigest.cu) + # ################################################################################################## # * type_dispatcher benchmark --------------------------------------------------------------------- ConfigureBench(TYPE_DISPATCHER_BENCH type_dispatcher/type_dispatcher.cu) diff --git a/cpp/benchmarks/quantiles/tdigest.cu b/cpp/benchmarks/quantiles/tdigest.cu new file mode 100644 index 00000000000..9d37dbc9a26 --- /dev/null +++ b/cpp/benchmarks/quantiles/tdigest.cu @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+ * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include + +#include +#include + +#include + +#include +#include +#include + +#include + +void bm_tdigest_merge(nvbench::state& state) +{ + auto const num_tdigests = static_cast(state.get_int64("num_tdigests")); + auto const tdigest_size = static_cast(state.get_int64("tdigest_size")); + auto const tdigests_per_group = + static_cast(state.get_int64("tdigests_per_group")); + auto const max_centroids = static_cast(state.get_int64("max_centroids")); + auto const num_groups = num_tdigests / tdigests_per_group; + auto const total_centroids = num_tdigests * tdigest_size; + + auto stream = cudf::get_default_stream(); + auto mr = rmm::mr::get_current_device_resource(); + + constexpr int base_value = 5; + + // construct inner means/weights + auto val_iter = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([tdigest_size](cudf::size_type i) { + return static_cast(base_value + (i % tdigest_size)); + })); + auto one_iter = thrust::make_constant_iterator(1); + cudf::test::fixed_width_column_wrapper means(val_iter, val_iter + total_centroids); + cudf::test::fixed_width_column_wrapper weights(one_iter, one_iter + total_centroids); + std::vector> inner_struct_children; + inner_struct_children.push_back(means.release()); + inner_struct_children.push_back(weights.release()); + cudf::test::structs_column_wrapper inner_struct(std::move(inner_struct_children)); + + // construct the tdigest lists themselves + auto offset_iter = cudf::detail::make_counting_transform_iterator( + 0, cuda::proclaim_return_type([tdigest_size](cudf::size_type i) { + return i * tdigest_size; + })); + cudf::test::fixed_width_column_wrapper offsets(offset_iter, offset_iter + num_tdigests + 1); + auto list_col = cudf::make_lists_column( + num_tdigests, offsets.release(), inner_struct.release(), 0, {}, stream, mr); + + // min and max columns + auto min_iter = thrust::make_constant_iterator(base_value); + auto max_iter = thrust::make_constant_iterator(base_value + (tdigest_size - 1)); + cudf::test::fixed_width_column_wrapper mins(min_iter, min_iter + num_tdigests); + cudf::test::fixed_width_column_wrapper maxes(max_iter, max_iter + num_tdigests); + + // assemble the whole thing + std::vector> tdigest_children; + tdigest_children.push_back(std::move(list_col)); + tdigest_children.push_back(mins.release()); + tdigest_children.push_back(maxes.release()); + cudf::test::structs_column_wrapper tdigest(std::move(tdigest_children)); + + rmm::device_uvector group_offsets(num_groups + 1, stream, mr); + rmm::device_uvector group_labels(num_tdigests, stream, mr); + auto group_offset_iter = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [tdigests_per_group] __device__(cudf::size_type i) { return i * tdigests_per_group; })); + thrust::copy(rmm::exec_policy_nosync(stream, mr), + group_offset_iter, + group_offset_iter + num_groups + 1, + group_offsets.begin()); + auto group_label_iter = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [tdigests_per_group] __device__(cudf::size_type i) { return i / tdigests_per_group; })); + thrust::copy(rmm::exec_policy_nosync(stream, mr), + group_label_iter, + group_label_iter + num_tdigests, + group_labels.begin()); + + state.add_element_count(total_centroids); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); + state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, + 
[&](nvbench::launch& launch, auto& timer) { + timer.start(); + auto result = cudf::tdigest::detail::group_merge_tdigest( + tdigest, group_offsets, group_labels, num_groups, max_centroids, stream, mr); + timer.stop(); + }); +} + +NVBENCH_BENCH(bm_tdigest_merge) + .set_name("TDigest many tiny groups") + .add_int64_axis("num_tdigests", {500'000}) + .add_int64_axis("tdigest_size", {1, 1000}) + .add_int64_axis("tdigests_per_group", {1}) + .add_int64_axis("max_centroids", {10000, 1000}); + +NVBENCH_BENCH(bm_tdigest_merge) + .set_name("TDigest many small groups") + .add_int64_axis("num_tdigests", {500'000}) + .add_int64_axis("tdigest_size", {1, 1000}) + .add_int64_axis("tdigests_per_group", {3}) + .add_int64_axis("max_centroids", {10000, 1000}); diff --git a/cpp/include/cudf/io/json.hpp b/cpp/include/cudf/io/json.hpp index ff25a5bacae..6798557e14e 100644 --- a/cpp/include/cudf/io/json.hpp +++ b/cpp/include/cudf/io/json.hpp @@ -105,6 +105,8 @@ class json_reader_options { char _delimiter = '\n'; // Prune columns on read, selected based on the _dtypes option bool _prune_columns = false; + // Experimental features: new column tree construction + bool _experimental = false; // Bytes to skip from the start size_t _byte_range_offset = 0; @@ -277,6 +279,15 @@ class json_reader_options { */ [[nodiscard]] bool is_enabled_prune_columns() const { return _prune_columns; } + /** + * @brief Whether to enable experimental features. + * + * When set to true, experimental features, such as the new column tree construction, + * utf-8 matching of field names will be enabled. + * @return true if experimental features are enabled + */ + [[nodiscard]] bool is_enabled_experimental() const { return _experimental; } + /** * @brief Whether to parse dates as DD/MM versus MM/DD. * @@ -453,6 +464,16 @@ class json_reader_options { */ void enable_prune_columns(bool val) { _prune_columns = val; } + /** + * @brief Set whether to enable experimental features. + * + * When set to true, experimental features, such as the new column tree construction, + * utf-8 matching of field names will be enabled. + * + * @param val Boolean value to enable/disable experimental features + */ + void enable_experimental(bool val) { _experimental = val; } + /** * @brief Set whether to parse dates as DD/MM versus MM/DD. * @@ -695,6 +716,21 @@ class json_reader_options_builder { return *this; } + /** + * @brief Set whether to enable experimental features. + * + * When set to true, experimental features, such as the new column tree construction, + * utf-8 matching of field names will be enabled. + * + * @param val Boolean value to enable/disable experimental features + * @return this for chaining + */ + json_reader_options_builder& experimental(bool val) + { + options._experimental = val; + return *this; + } + /** * @brief Set whether to parse dates as DD/MM versus MM/DD. * diff --git a/cpp/src/io/json/column_tree_construction.cu b/cpp/src/io/json/column_tree_construction.cu new file mode 100644 index 00000000000..c4fe7926706 --- /dev/null +++ b/cpp/src/io/json/column_tree_construction.cu @@ -0,0 +1,304 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. 
+ * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "nested_json.hpp" + +#include +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace cudf::io::json { + +using row_offset_t = size_type; + +#ifdef CSR_DEBUG_PRINT +template +void print(device_span d_vec, std::string name, rmm::cuda_stream_view stream) +{ + stream.synchronize(); + auto h_vec = cudf::detail::make_std_vector_sync(d_vec, stream); + std::cout << name << " = "; + for (auto e : h_vec) { + std::cout << e << " "; + } + std::cout << std::endl; +} +#endif + +namespace experimental::detail { + +struct level_ordering { + device_span node_levels; + device_span col_ids; + device_span parent_node_ids; + __device__ bool operator()(NodeIndexT lhs_node_id, NodeIndexT rhs_node_id) const + { + auto lhs_parent_col_id = parent_node_ids[lhs_node_id] == parent_node_sentinel + ? parent_node_sentinel + : col_ids[parent_node_ids[lhs_node_id]]; + auto rhs_parent_col_id = parent_node_ids[rhs_node_id] == parent_node_sentinel + ? parent_node_sentinel + : col_ids[parent_node_ids[rhs_node_id]]; + + return (node_levels[lhs_node_id] < node_levels[rhs_node_id]) || + (node_levels[lhs_node_id] == node_levels[rhs_node_id] && + lhs_parent_col_id < rhs_parent_col_id) || + (node_levels[lhs_node_id] == node_levels[rhs_node_id] && + lhs_parent_col_id == rhs_parent_col_id && col_ids[lhs_node_id] < col_ids[rhs_node_id]); + } +}; + +struct parent_nodeids_to_colids { + device_span rev_mapped_col_ids; + __device__ auto operator()(NodeIndexT parent_node_id) -> NodeIndexT + { + return parent_node_id == parent_node_sentinel ? parent_node_sentinel + : rev_mapped_col_ids[parent_node_id]; + } +}; + +/** + * @brief Reduces node tree representation to column tree CSR representation. 
+ * + * @param node_tree Node tree representation of JSON string + * @param original_col_ids Column ids of nodes + * @param row_offsets Row offsets of nodes + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true + * @param stream CUDA stream used for device memory operations and kernel launches + * @return A tuple of column tree representation of JSON string, column ids of columns, and + * max row offsets of columns + */ +std::tuple reduce_to_column_tree( + tree_meta_t& node_tree, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, + device_span row_offsets, + bool is_array_of_arrays, + NodeIndexT row_array_parent_col_id, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + + if (original_col_ids.empty()) { + rmm::device_uvector empty_row_idx(0, stream); + rmm::device_uvector empty_col_idx(0, stream); + rmm::device_uvector empty_column_categories(0, stream); + rmm::device_uvector empty_max_row_offsets(0, stream); + rmm::device_uvector empty_mapped_col_ids(0, stream); + return std::tuple{compressed_sparse_row{std::move(empty_row_idx), std::move(empty_col_idx)}, + column_tree_properties{std::move(empty_column_categories), + std::move(empty_max_row_offsets), + std::move(empty_mapped_col_ids)}}; + } + + auto [unpermuted_tree, unpermuted_col_ids, unpermuted_max_row_offsets] = + cudf::io::json::detail::reduce_to_column_tree(node_tree, + original_col_ids, + sorted_col_ids, + ordered_node_ids, + row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + + NodeIndexT num_columns = unpermuted_col_ids.size(); + + auto mapped_col_ids = cudf::detail::make_device_uvector_async( + unpermuted_col_ids, stream, cudf::get_current_device_resource_ref()); + rmm::device_uvector rev_mapped_col_ids(num_columns, stream); + rmm::device_uvector reordering_index(unpermuted_col_ids.size(), stream); + + thrust::sequence( + rmm::exec_policy_nosync(stream), reordering_index.begin(), reordering_index.end()); + // Reorder nodes and column ids in level-wise fashion + thrust::sort_by_key( + rmm::exec_policy_nosync(stream), + reordering_index.begin(), + reordering_index.end(), + mapped_col_ids.begin(), + level_ordering{ + unpermuted_tree.node_levels, unpermuted_col_ids, unpermuted_tree.parent_node_ids}); + + { + auto mapped_col_ids_copy = cudf::detail::make_device_uvector_async( + mapped_col_ids, stream, cudf::get_current_device_resource_ref()); + thrust::sequence( + rmm::exec_policy_nosync(stream), rev_mapped_col_ids.begin(), rev_mapped_col_ids.end()); + thrust::sort_by_key(rmm::exec_policy_nosync(stream), + mapped_col_ids_copy.begin(), + mapped_col_ids_copy.end(), + rev_mapped_col_ids.begin()); + } + + rmm::device_uvector parent_col_ids(num_columns, stream); + thrust::transform_output_iterator parent_col_ids_it(parent_col_ids.begin(), + parent_nodeids_to_colids{rev_mapped_col_ids}); + rmm::device_uvector max_row_offsets(num_columns, stream); + rmm::device_uvector column_categories(num_columns, stream); + thrust::copy_n( + rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(thrust::make_permutation_iterator( + unpermuted_tree.parent_node_ids.begin(), reordering_index.begin()), + thrust::make_permutation_iterator(unpermuted_max_row_offsets.begin(), + reordering_index.begin()), + thrust::make_permutation_iterator( + unpermuted_tree.node_categories.begin(), reordering_index.begin())), + num_columns, + thrust::make_zip_iterator( + parent_col_ids_it, 
max_row_offsets.begin(), column_categories.begin())); + +#ifdef CSR_DEBUG_PRINT + print(reordering_index, "h_reordering_index", stream); + print(mapped_col_ids, "h_mapped_col_ids", stream); + print(rev_mapped_col_ids, "h_rev_mapped_col_ids", stream); + print(parent_col_ids, "h_parent_col_ids", stream); + print(max_row_offsets, "h_max_row_offsets", stream); +#endif + + auto construct_row_idx = [&stream](NodeIndexT num_columns, + device_span parent_col_ids) { + auto row_idx = cudf::detail::make_zeroed_device_uvector_async( + static_cast(num_columns + 1), stream, cudf::get_current_device_resource_ref()); + // Note that the first element of csr_parent_col_ids is -1 (parent_node_sentinel) + // children adjacency + + auto num_non_leaf_columns = thrust::unique_count( + rmm::exec_policy_nosync(stream), parent_col_ids.begin() + 1, parent_col_ids.end()); + rmm::device_uvector non_leaf_nodes(num_non_leaf_columns, stream); + rmm::device_uvector non_leaf_nodes_children(num_non_leaf_columns, stream); + thrust::reduce_by_key(rmm::exec_policy_nosync(stream), + parent_col_ids.begin() + 1, + parent_col_ids.end(), + thrust::make_constant_iterator(1), + non_leaf_nodes.begin(), + non_leaf_nodes_children.begin(), + thrust::equal_to()); + + thrust::scatter(rmm::exec_policy_nosync(stream), + non_leaf_nodes_children.begin(), + non_leaf_nodes_children.end(), + non_leaf_nodes.begin(), + row_idx.begin() + 1); + + if (num_columns > 1) { + thrust::transform_inclusive_scan( + rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(thrust::make_counting_iterator(1), row_idx.begin() + 1), + thrust::make_zip_iterator(thrust::make_counting_iterator(1) + num_columns, row_idx.end()), + row_idx.begin() + 1, + cuda::proclaim_return_type([] __device__(auto a) { + auto n = thrust::get<0>(a); + auto idx = thrust::get<1>(a); + return n == 1 ? idx : idx + 1; + }), + thrust::plus{}); + } else { + auto single_node = 1; + row_idx.set_element_async(1, single_node, stream); + } + +#ifdef CSR_DEBUG_PRINT + print(row_idx, "h_row_idx", stream); +#endif + return row_idx; + }; + + auto construct_col_idx = [&stream](NodeIndexT num_columns, + device_span parent_col_ids, + device_span row_idx) { + rmm::device_uvector col_idx((num_columns - 1) * 2, stream); + thrust::fill(rmm::exec_policy_nosync(stream), col_idx.begin(), col_idx.end(), -1); + // excluding root node, construct scatter map + rmm::device_uvector map(num_columns - 1, stream); + thrust::inclusive_scan_by_key(rmm::exec_policy_nosync(stream), + parent_col_ids.begin() + 1, + parent_col_ids.end(), + thrust::make_constant_iterator(1), + map.begin()); + thrust::for_each_n(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(1), + num_columns - 1, + [row_idx = row_idx.begin(), + map = map.begin(), + parent_col_ids = parent_col_ids.begin()] __device__(auto i) { + auto parent_col_id = parent_col_ids[i]; + if (parent_col_id == 0) + --map[i - 1]; + else + map[i - 1] += row_idx[parent_col_id]; + }); + thrust::scatter(rmm::exec_policy_nosync(stream), + thrust::make_counting_iterator(1), + thrust::make_counting_iterator(1) + num_columns - 1, + map.begin(), + col_idx.begin()); + + // Skip the parent of root node + thrust::scatter(rmm::exec_policy_nosync(stream), + parent_col_ids.begin() + 1, + parent_col_ids.end(), + row_idx.begin() + 1, + col_idx.begin()); + +#ifdef CSR_DEBUG_PRINT + print(col_idx, "h_col_idx", stream); +#endif + + return col_idx; + }; + + /* + 5. CSR construction: + a. Sort column levels and get their ordering + b. 
For each column node coln iterated according to sorted_column_levels; do + i. Find nodes that have coln as the parent node -> set adj_coln + ii. row idx[coln] = size of adj_coln + 1 + iii. col idx[coln] = adj_coln U {parent_col_id[coln]} + */ + auto row_idx = construct_row_idx(num_columns, parent_col_ids); + auto col_idx = construct_col_idx(num_columns, parent_col_ids, row_idx); + + return std::tuple{ + compressed_sparse_row{std::move(row_idx), std::move(col_idx)}, + column_tree_properties{ + std::move(column_categories), std::move(max_row_offsets), std::move(mapped_col_ids)}}; +} + +} // namespace experimental::detail +} // namespace cudf::io::json diff --git a/cpp/src/io/json/host_tree_algorithms.cu b/cpp/src/io/json/host_tree_algorithms.cu index 70d61132b42..5855f1b5a5f 100644 --- a/cpp/src/io/json/host_tree_algorithms.cu +++ b/cpp/src/io/json/host_tree_algorithms.cu @@ -21,6 +21,7 @@ #include #include #include +#include #include #include #include @@ -43,6 +44,7 @@ #include #include +#include namespace cudf::io::json::detail { @@ -58,16 +60,15 @@ namespace cudf::io::json::detail { */ rmm::device_uvector get_values_column_indices(TreeDepthT const row_array_children_level, tree_meta_t const& d_tree, - device_span col_ids, + device_span col_ids, size_type const num_columns, rmm::cuda_stream_view stream) { - CUDF_FUNC_RANGE(); auto [level2_nodes, level2_indices] = get_array_children_indices( row_array_children_level, d_tree.node_levels, d_tree.parent_node_ids, stream); auto col_id_location = thrust::make_permutation_iterator(col_ids.begin(), level2_nodes.begin()); rmm::device_uvector values_column_indices(num_columns, stream); - thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), level2_indices.begin(), level2_indices.end(), col_id_location, @@ -90,12 +91,11 @@ std::vector copy_strings_to_host_sync( device_span node_range_end, rmm::cuda_stream_view stream) { - CUDF_FUNC_RANGE(); auto const num_strings = node_range_begin.size(); rmm::device_uvector string_offsets(num_strings, stream); rmm::device_uvector string_lengths(num_strings, stream); auto d_offset_pairs = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), d_offset_pairs, d_offset_pairs + num_strings, thrust::make_zip_iterator(string_offsets.begin(), string_lengths.begin()), @@ -161,18 +161,18 @@ std::vector copy_strings_to_host_sync( rmm::device_uvector is_all_nulls_each_column(device_span input, tree_meta_t const& d_column_tree, tree_meta_t const& tree, - device_span col_ids, + device_span col_ids, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream) { auto const num_nodes = col_ids.size(); auto const num_cols = d_column_tree.node_categories.size(); rmm::device_uvector is_all_nulls(num_cols, stream); - thrust::fill(rmm::exec_policy(stream), is_all_nulls.begin(), is_all_nulls.end(), true); + thrust::fill(rmm::exec_policy_nosync(stream), is_all_nulls.begin(), is_all_nulls.end(), true); auto parse_opt = parsing_options(options, stream); thrust::for_each_n( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::counting_iterator(0), num_nodes, [options = parse_opt.view(), @@ -193,7 +193,7 @@ rmm::device_uvector is_all_nulls_each_column(device_span return is_all_nulls; } -NodeIndexT get_row_array_parent_col_id(device_span col_ids, +NodeIndexT get_row_array_parent_col_id(device_span col_ids, bool is_enabled_lines, rmm::cuda_stream_view 
stream) { @@ -221,33 +221,34 @@ struct json_column_data { bitmask_type* validity; }; -std::pair, - std::unordered_map>> -build_tree(device_json_column& root, - std::vector const& is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr); -void scatter_offsets( - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - device_span node_ids, - device_span sorted_col_ids, // Reuse this for parent_col_ids +using hashmap_of_device_columns = + std::unordered_map>; + +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, tree_meta_t& d_column_tree, - host_span ignore_vals, - std::unordered_map>& columns, - rmm::cuda_stream_view stream); + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +void scatter_offsets(tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t const& d_column_tree, + host_span ignore_vals, + hashmap_of_device_columns const& columns, + rmm::cuda_stream_view stream); /** * @brief Constructs `d_json_column` from node tree representation - * Newly constructed columns are insert into `root`'s children. + * Newly constructed columns are inserted into `root`'s children. * `root` must be a list type. 
* * @param input Input JSON string device data @@ -265,28 +266,28 @@ void scatter_offsets( * of child_offets and validity members of `d_json_column` */ void make_device_json_column(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, + tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, device_json_column& root, bool is_array_of_arrays, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - CUDF_FUNC_RANGE(); - bool const is_enabled_lines = options.is_enabled_lines(); bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); - auto const num_nodes = col_ids.size(); - rmm::device_uvector sorted_col_ids(col_ids.size(), stream); // make a copy - thrust::copy(rmm::exec_policy(stream), col_ids.begin(), col_ids.end(), sorted_col_ids.begin()); + // make a copy + auto sorted_col_ids = cudf::detail::make_device_uvector_async( + col_ids, stream, cudf::get_current_device_resource_ref()); // sort by {col_id} on {node_ids} stable rmm::device_uvector node_ids(col_ids.size(), stream); - thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); - thrust::stable_sort_by_key( - rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); + thrust::sequence(rmm::exec_policy_nosync(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), + sorted_col_ids.begin(), + sorted_col_ids.end(), + node_ids.begin()); NodeIndexT const row_array_parent_col_id = get_row_array_parent_col_id(col_ids, is_enabled_lines, stream); @@ -316,7 +317,7 @@ void make_device_json_column(device_span input, cudf::detail::make_host_vector_sync(values_column_indices, stream); std::transform(unique_col_ids.begin(), unique_col_ids.end(), - column_names.begin(), + column_names.cbegin(), column_names.begin(), [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( auto col_id, auto name) mutable { @@ -333,17 +334,17 @@ void make_device_json_column(device_span input, } return std::vector(); }(); - auto [ignore_vals, columns] = build_tree(root, - is_str_column_all_nulls, - d_column_tree, - d_unique_col_ids, - d_max_row_offsets, - column_names, - row_array_parent_col_id, - is_array_of_arrays, - options, - stream, - mr); + auto const [ignore_vals, columns] = build_tree(root, + is_str_column_all_nulls, + d_column_tree, + d_unique_col_ids, + d_max_row_offsets, + column_names, + row_array_parent_col_id, + is_array_of_arrays, + options, + stream, + mr); scatter_offsets(tree, col_ids, @@ -356,19 +357,18 @@ void make_device_json_column(device_span input, stream); } -std::pair, - std::unordered_map>> -build_tree(device_json_column& root, - std::vector const& is_str_column_all_nulls, - tree_meta_t& d_column_tree, - device_span d_unique_col_ids, - device_span d_max_row_offsets, - std::vector const& column_names, - NodeIndexT row_array_parent_col_id, - bool is_array_of_arrays, - cudf::io::json_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::device_async_resource_ref mr) +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + 
rmm::device_async_resource_ref mr) { bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); @@ -380,6 +380,7 @@ build_tree(device_json_column& root, cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); auto num_columns = d_unique_col_ids.size(); + stream.synchronize(); auto to_json_col_type = [](auto category) { switch (category) { @@ -439,11 +440,12 @@ build_tree(device_json_column& root, }); // use hash map because we may skip field name's col_ids - std::unordered_map> columns; + hashmap_of_device_columns columns; // map{parent_col_id, child_col_name}> = child_col_id, used for null value column tracking std::map, NodeIndexT> mapped_columns; // find column_ids which are values, but should be ignored in validity - auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); + auto ignore_vals = cudf::detail::make_host_vector(num_columns, stream); + std::fill(ignore_vals.begin(), ignore_vals.end(), false); std::vector is_mixed_type_column(num_columns, 0); std::vector is_pruned(num_columns, 0); // for columns that are not mixed type but have been forced as string @@ -452,7 +454,7 @@ build_tree(device_json_column& root, std::function remove_child_columns = [&](NodeIndexT this_col_id, device_json_column& col) { - for (auto col_name : col.column_order) { + for (auto const& col_name : col.column_order) { auto child_id = mapped_columns[{this_col_id, col_name}]; is_mixed_type_column[child_id] = 1; remove_child_columns(child_id, col.child_columns.at(col_name)); @@ -523,7 +525,7 @@ build_tree(device_json_column& root, if (parent_col_id != parent_node_sentinel && (is_mixed_type_column[parent_col_id] || is_pruned[this_col_id]) || forced_as_string_column[parent_col_id]) { - ignore_vals[this_col_id] = 1; + ignore_vals[this_col_id] = true; if (is_mixed_type_column[parent_col_id]) { is_mixed_type_column[this_col_id] = 1; } if (forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; } continue; @@ -569,12 +571,12 @@ build_tree(device_json_column& root, } if (column_categories[this_col_id] == NC_VAL || column_categories[this_col_id] == NC_STR) { - ignore_vals[this_col_id] = 1; + ignore_vals[this_col_id] = true; continue; } if (column_categories[old_col_id] == NC_VAL || column_categories[old_col_id] == NC_STR) { // remap - ignore_vals[old_col_id] = 1; + ignore_vals[old_col_id] = true; mapped_columns.erase({parent_col_id, name}); columns.erase(old_col_id); parent_col.child_columns.erase(name); @@ -624,7 +626,7 @@ build_tree(device_json_column& root, auto parent_col_id = column_parent_ids[this_col_id]; if (parent_col_id != parent_node_sentinel and is_mixed_type_column[parent_col_id] == 1) { is_mixed_type_column[this_col_id] = 1; - ignore_vals[this_col_id] = 1; + ignore_vals[this_col_id] = true; columns.erase(this_col_id); } // Convert only mixed type columns as string (so to copy), but not its children @@ -644,7 +646,7 @@ build_tree(device_json_column& root, auto parent_col_id = column_parent_ids[this_col_id]; if (parent_col_id != parent_node_sentinel and forced_as_string_column[parent_col_id]) { forced_as_string_column[this_col_id] = true; - ignore_vals[this_col_id] = 1; + ignore_vals[this_col_id] = true; } // Convert only mixed type columns as string (so to copy), but not its children if (parent_col_id != parent_node_sentinel and 
not forced_as_string_column[parent_col_id] and @@ -664,16 +666,15 @@ build_tree(device_json_column& root, return {ignore_vals, columns}; } -void scatter_offsets( - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, - device_span node_ids, - device_span sorted_col_ids, // Reuse this for parent_col_ids - tree_meta_t& d_column_tree, - host_span ignore_vals, - std::unordered_map>& columns, - rmm::cuda_stream_view stream) +void scatter_offsets(tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_span node_ids, + device_span sorted_col_ids, // Reuse this for parent_col_ids + tree_meta_t const& d_column_tree, + host_span ignore_vals, + hashmap_of_device_columns const& columns, + rmm::cuda_stream_view stream) { auto const num_nodes = col_ids.size(); auto const num_columns = d_column_tree.node_categories.size(); @@ -695,7 +696,7 @@ void scatter_offsets( // 3. scatter string offsets to respective columns, set validity bits thrust::for_each_n( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::counting_iterator(0), num_nodes, [column_categories = d_column_tree.node_categories.begin(), @@ -739,7 +740,7 @@ void scatter_offsets( : col_ids[parent_node_ids[node_id]]; })); auto const list_children_end = thrust::copy_if( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id), thrust::make_zip_iterator(thrust::make_counting_iterator(0), parent_col_id) + num_nodes, @@ -757,12 +758,12 @@ void scatter_offsets( auto const num_list_children = list_children_end - thrust::make_zip_iterator(node_ids.begin(), parent_col_ids.begin()); - thrust::stable_sort_by_key(rmm::exec_policy(stream), + thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), parent_col_ids.begin(), parent_col_ids.begin() + num_list_children, node_ids.begin()); thrust::for_each_n( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), num_list_children, [node_ids = node_ids.begin(), @@ -805,4 +806,599 @@ void scatter_offsets( stream.synchronize(); } +namespace experimental { + +std::map unified_schema(cudf::io::json_reader_options const& options) +{ + return std::visit( + cudf::detail::visitor_overload{ + [](std::vector const& user_dtypes) { + std::map dnew; + std::transform(thrust::counting_iterator(0), + thrust::counting_iterator(user_dtypes.size()), + std::inserter(dnew, dnew.end()), + [&user_dtypes](auto i) { + return std::pair(std::to_string(i), schema_element{user_dtypes[i]}); + }); + return dnew; + }, + [](std::map const& user_dtypes) { + std::map dnew; + std::transform(user_dtypes.begin(), + user_dtypes.end(), + std::inserter(dnew, dnew.end()), + [](auto key_dtype) { + return std::pair(key_dtype.first, schema_element{key_dtype.second}); + }); + return dnew; + }, + [](std::map const& user_dtypes) { return user_dtypes; }}, + options.get_dtypes()); +} + +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); + +/** + * @brief Constructs `d_json_column` from node tree representation + * Newly constructed columns are inserted into `root`'s children. + * `root` must be a list type. 
+ * + * @param input Input JSON string device data + * @param tree Node tree representation of the JSON string + * @param col_ids Column ids of the nodes in the tree + * @param row_offsets Row offsets of the nodes in the tree + * @param root Root node of the `d_json_column` tree + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param options Parsing options specifying the parsing behaviour + * options affecting behaviour are + * is_enabled_lines: Whether the input is a line-delimited JSON + * is_enabled_mixed_types_as_string: Whether to enable reading mixed types as string + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource used to allocate the device memory + * of child_offets and validity members of `d_json_column` + */ +void make_device_json_column(device_span input, + tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + bool const is_enabled_lines = options.is_enabled_lines(); + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + // make a copy + auto sorted_col_ids = cudf::detail::make_device_uvector_async( + col_ids, stream, cudf::get_current_device_resource_ref()); + + // sort by {col_id} on {node_ids} stable + rmm::device_uvector node_ids(col_ids.size(), stream); + thrust::sequence(rmm::exec_policy_nosync(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key(rmm::exec_policy_nosync(stream), + sorted_col_ids.begin(), + sorted_col_ids.end(), + node_ids.begin()); + + NodeIndexT const row_array_parent_col_id = + get_row_array_parent_col_id(col_ids, is_enabled_lines, stream); + + // 1. gather column information. + auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = + reduce_to_column_tree(tree, + col_ids, + sorted_col_ids, + node_ids, + row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + + auto num_columns = d_unique_col_ids.size(); + std::vector column_names = copy_strings_to_host_sync( + input, d_column_tree.node_range_begin, d_column_tree.node_range_end, stream); + // array of arrays column names + if (is_array_of_arrays) { + auto const unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); + TreeDepthT const row_array_children_level = is_enabled_lines ? 1 : 2; + auto values_column_indices = + get_values_column_indices(row_array_children_level, tree, col_ids, num_columns, stream); + auto h_values_column_indices = + cudf::detail::make_host_vector_sync(values_column_indices, stream); + std::transform(unique_col_ids.begin(), + unique_col_ids.end(), + column_names.cbegin(), + column_names.begin(), + [&h_values_column_indices, &column_parent_ids, row_array_parent_col_id]( + auto col_id, auto name) mutable { + return column_parent_ids[col_id] == row_array_parent_col_id + ? 
std::to_string(h_values_column_indices[col_id]) + : name; + }); + } + + auto const is_str_column_all_nulls = [&, &column_tree = d_column_tree]() { + if (is_enabled_mixed_types_as_string) { + return cudf::detail::make_std_vector_sync( + is_all_nulls_each_column(input, column_tree, tree, col_ids, options, stream), stream); + } + return std::vector(); + }(); + auto const [ignore_vals, columns] = build_tree(root, + is_str_column_all_nulls, + d_column_tree, + d_unique_col_ids, + d_max_row_offsets, + column_names, + row_array_parent_col_id, + is_array_of_arrays, + options, + stream, + mr); + if (ignore_vals.empty()) return; + scatter_offsets(tree, + col_ids, + row_offsets, + node_ids, + sorted_col_ids, + d_column_tree, + ignore_vals, + columns, + stream); +} + +std::pair, hashmap_of_device_columns> build_tree( + device_json_column& root, + host_span is_str_column_all_nulls, + tree_meta_t& d_column_tree, + device_span d_unique_col_ids, + device_span d_max_row_offsets, + std::vector const& column_names, + NodeIndexT row_array_parent_col_id, + bool is_array_of_arrays, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr) +{ + bool const is_enabled_lines = options.is_enabled_lines(); + bool const is_enabled_mixed_types_as_string = options.is_enabled_mixed_types_as_string(); + auto unique_col_ids = cudf::detail::make_host_vector_async(d_unique_col_ids, stream); + auto column_categories = + cudf::detail::make_host_vector_async(d_column_tree.node_categories, stream); + auto const column_parent_ids = + cudf::detail::make_host_vector_async(d_column_tree.parent_node_ids, stream); + auto column_range_beg = + cudf::detail::make_host_vector_async(d_column_tree.node_range_begin, stream); + auto const max_row_offsets = cudf::detail::make_host_vector_async(d_max_row_offsets, stream); + auto num_columns = d_unique_col_ids.size(); + stream.synchronize(); + + auto to_json_col_type = [](auto category) { + switch (category) { + case NC_STRUCT: return json_col_t::StructColumn; + case NC_LIST: return json_col_t::ListColumn; + case NC_STR: [[fallthrough]]; + case NC_VAL: return json_col_t::StringColumn; + default: return json_col_t::Unknown; + } + }; + + auto initialize_json_columns = [&](auto i, auto& col_ref, auto column_category) { + auto& col = col_ref.get(); + if (col.type != json_col_t::Unknown) { return; } + if (column_category == NC_ERR || column_category == NC_FN) { + return; + } else if (column_category == NC_VAL || column_category == NC_STR) { + col.string_offsets.resize(max_row_offsets[i] + 1, stream); + col.string_lengths.resize(max_row_offsets[i] + 1, stream); + thrust::fill( + rmm::exec_policy_nosync(stream), + thrust::make_zip_iterator(col.string_offsets.begin(), col.string_lengths.begin()), + thrust::make_zip_iterator(col.string_offsets.end(), col.string_lengths.end()), + thrust::make_tuple(0, 0)); + } else if (column_category == NC_LIST) { + col.child_offsets.resize(max_row_offsets[i] + 2, stream); + thrust::uninitialized_fill( + rmm::exec_policy_nosync(stream), col.child_offsets.begin(), col.child_offsets.end(), 0); + } + col.num_rows = max_row_offsets[i] + 1; + col.validity = + cudf::detail::create_null_mask(col.num_rows, cudf::mask_state::ALL_NULL, stream, mr); + col.type = to_json_col_type(column_category); + }; + + // 2. generate nested columns tree and its device_memory + // reorder unique_col_ids w.r.t. column_range_begin for order of column to be in field order. 
+ auto h_range_col_id_it = + thrust::make_zip_iterator(column_range_beg.begin(), unique_col_ids.begin()); + std::sort(h_range_col_id_it, h_range_col_id_it + num_columns, [](auto const& a, auto const& b) { + return thrust::get<0>(a) < thrust::get<0>(b); + }); + // adjacency list construction + std::map> adj; + for (auto const this_col_id : unique_col_ids) { + auto parent_col_id = column_parent_ids[this_col_id]; + adj[parent_col_id].push_back(this_col_id); + } + + // Pruning + auto is_pruned = cudf::detail::make_host_vector(num_columns, stream); + std::fill_n(is_pruned.begin(), num_columns, options.is_enabled_prune_columns()); + + // prune all children of a column, but not self. + auto ignore_all_children = [&](auto parent_col_id) { + std::deque offspring; + if (adj.count(parent_col_id)) { + for (auto const& child : adj[parent_col_id]) { + offspring.push_back(child); + } + } + while (!offspring.empty()) { + auto this_id = offspring.front(); + offspring.pop_front(); + is_pruned[this_id] = true; + if (adj.count(this_id)) { + for (auto const& child : adj[this_id]) { + offspring.push_back(child); + } + } + } + }; + + // Pruning: iterate through schema and mark only those columns and enforce type. + // NoPruning: iterate through schema and enforce type. + + if (adj[parent_node_sentinel].empty()) + return {cudf::detail::make_host_vector(0, stream), {}}; // for empty file + CUDF_EXPECTS(adj[parent_node_sentinel].size() == 1, "Should be 1"); + auto expected_types = cudf::detail::make_host_vector(num_columns, stream); + std::fill_n(expected_types.begin(), num_columns, NUM_NODE_CLASSES); + + auto lookup_names = [&column_names](auto child_ids, auto name) { + for (auto const& child_id : child_ids) { + if (column_names[child_id] == name) return child_id; + } + return -1; + }; + // recursive lambda on schema to mark columns as pruned. + std::function mark_is_pruned; + mark_is_pruned = [&is_pruned, + &mark_is_pruned, + &adj, + &lookup_names, + &column_categories, + &expected_types, + &ignore_all_children](NodeIndexT root, schema_element const& schema) -> void { + if (root == -1) return; + bool pass = + (schema.type == data_type{type_id::STRUCT} and column_categories[root] == NC_STRUCT) or + (schema.type == data_type{type_id::LIST} and column_categories[root] == NC_LIST) or + (schema.type != data_type{type_id::STRUCT} and schema.type != data_type{type_id::LIST} and + column_categories[root] != NC_FN); + if (!pass) { + // ignore all children of this column and prune this column. + is_pruned[root] = true; + ignore_all_children(root); + return; + } + is_pruned[root] = false; + auto expected_type = [](auto type, auto cat) { + if (type == data_type{type_id::STRUCT} and cat == NC_STRUCT) return NC_STRUCT; + if (type == data_type{type_id::LIST} and cat == NC_LIST) return NC_LIST; + if (type != data_type{type_id::STRUCT} and type != data_type{type_id::LIST}) return NC_STR; + return NC_ERR; + }(schema.type, column_categories[root]); + expected_types[root] = expected_type; // forced type. + // ignore children of nested columns, but not self. + if (expected_type == NC_STR and + (column_categories[root] == NC_STRUCT or column_categories[root] == NC_LIST)) + ignore_all_children(root); + if (not(schema.type == data_type{type_id::STRUCT} or schema.type == data_type{type_id::LIST})) + return; // no children to mark for non-nested. + auto child_ids = adj.count(root) ? 
adj[root] : std::vector{}; + if (schema.type == data_type{type_id::STRUCT}) { + for (auto const& key_pair : schema.child_types) { + auto col_id = lookup_names(child_ids, key_pair.first); + if (col_id == -1) continue; + is_pruned[col_id] = false; + for (auto const& child_id : adj[col_id]) // children of field (>1 if mixed) + mark_is_pruned(child_id, key_pair.second); + } + } else if (schema.type == data_type{type_id::LIST}) { + // partial solution for list children to have any name. + auto this_list_child_name = + schema.child_types.size() == 1 ? schema.child_types.begin()->first : list_child_name; + if (schema.child_types.count(this_list_child_name) == 0) return; + auto list_child = schema.child_types.at(this_list_child_name); + for (auto const& child_id : child_ids) + mark_is_pruned(child_id, list_child); + } + }; + if (is_array_of_arrays) { + if (adj[adj[parent_node_sentinel][0]].empty()) + return {cudf::detail::make_host_vector(0, stream), {}}; + auto root_list_col_id = + is_enabled_lines ? adj[parent_node_sentinel][0] : adj[adj[parent_node_sentinel][0]][0]; + // mark root and row array col_id as not pruned. + if (!is_enabled_lines) { + auto top_level_list_id = adj[parent_node_sentinel][0]; + is_pruned[top_level_list_id] = false; + } + is_pruned[root_list_col_id] = false; + std::visit(cudf::detail::visitor_overload{ + [&root_list_col_id, &adj, &mark_is_pruned, &column_names]( + std::vector const& user_dtypes) -> void { + for (size_t i = 0; i < adj[root_list_col_id].size() && i < user_dtypes.size(); + i++) { + NodeIndexT const first_child_id = adj[root_list_col_id][i]; + auto name = column_names[first_child_id]; + auto value_id = std::stol(name); + if (value_id >= 0 and value_id < static_cast(user_dtypes.size())) + mark_is_pruned(first_child_id, schema_element{user_dtypes[value_id]}); + // Note: mixed type - forced type, will work here. + } + }, + [&root_list_col_id, &adj, &mark_is_pruned, &column_names]( + std::map const& user_dtypes) -> void { + for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { + auto const first_child_id = adj[root_list_col_id][i]; + auto name = column_names[first_child_id]; + if (user_dtypes.count(name)) + mark_is_pruned(first_child_id, schema_element{user_dtypes.at(name)}); + } + }, + [&root_list_col_id, &adj, &mark_is_pruned, &column_names]( + std::map const& user_dtypes) -> void { + for (size_t i = 0; i < adj[root_list_col_id].size(); i++) { + auto const first_child_id = adj[root_list_col_id][i]; + auto name = column_names[first_child_id]; + if (user_dtypes.count(name)) + mark_is_pruned(first_child_id, user_dtypes.at(name)); + } + }}, + options.get_dtypes()); + } else { + auto root_struct_col_id = + is_enabled_lines + ? adj[parent_node_sentinel][0] + : (adj[adj[parent_node_sentinel][0]].empty() ? -1 : adj[adj[parent_node_sentinel][0]][0]); + // mark root and row struct col_id as not pruned. 
+ if (!is_enabled_lines) { + auto top_level_list_id = adj[parent_node_sentinel][0]; + is_pruned[top_level_list_id] = false; + } + is_pruned[root_struct_col_id] = false; + schema_element u_schema{data_type{type_id::STRUCT}}; + u_schema.child_types = unified_schema(options); + std::visit( + cudf::detail::visitor_overload{ + [&is_pruned, &root_struct_col_id, &adj, &mark_is_pruned]( + std::vector const& user_dtypes) -> void { + for (size_t i = 0; i < adj[root_struct_col_id].size() && i < user_dtypes.size(); i++) { + NodeIndexT const first_field_id = adj[root_struct_col_id][i]; + is_pruned[first_field_id] = false; + for (auto const& child_id : adj[first_field_id]) // children of field (>1 if mixed) + mark_is_pruned(child_id, schema_element{user_dtypes[i]}); + } + }, + [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema]( + std::map const& user_dtypes) -> void { + mark_is_pruned(root_struct_col_id, u_schema); + }, + [&root_struct_col_id, &adj, &mark_is_pruned, &u_schema]( + std::map const& user_dtypes) -> void { + mark_is_pruned(root_struct_col_id, u_schema); + }}, + options.get_dtypes()); + } + // Useful for array of arrays + auto named_level = + is_enabled_lines + ? adj[parent_node_sentinel][0] + : (adj[adj[parent_node_sentinel][0]].empty() ? -1 : adj[adj[parent_node_sentinel][0]][0]); + + auto handle_mixed_types = [&column_categories, + &is_str_column_all_nulls, + &is_pruned, + &expected_types, + &is_enabled_mixed_types_as_string, + &ignore_all_children](std::vector& child_ids) { + // do these on unpruned columns only. + // when mixed types is disabled, ignore string sibling of nested column. + // when mixed types is disabled, and both list and struct columns are siblings, error out. + // when mixed types is enabled, force string type on all columns + + // Remove pruned children (forced type will not clash here because other types are already + // pruned) + child_ids.erase( + std::remove_if(child_ids.begin(), + child_ids.end(), + [&is_pruned](NodeIndexT child_id) { return is_pruned[child_id]; }), + child_ids.end()); + // find string id, struct id, list id. + NodeIndexT str_col_id{-1}, struct_col_id{-1}, list_col_id{-1}; + for (auto const& child_id : child_ids) { + if (column_categories[child_id] == NC_VAL || column_categories[child_id] == NC_STR) + str_col_id = child_id; + else if (column_categories[child_id] == NC_STRUCT) + struct_col_id = child_id; + else if (column_categories[child_id] == NC_LIST) + list_col_id = child_id; + } + // conditions for handling mixed types. + if (is_enabled_mixed_types_as_string) { + if (struct_col_id != -1 and list_col_id != -1) { + expected_types[struct_col_id] = NC_STR; + expected_types[list_col_id] = NC_STR; + // ignore children of nested columns. + ignore_all_children(struct_col_id); + ignore_all_children(list_col_id); + } + if ((struct_col_id != -1 or list_col_id != -1) and str_col_id != -1) { + if (is_str_column_all_nulls[str_col_id]) + is_pruned[str_col_id] = true; + else { + // ignore children of nested columns. + if (struct_col_id != -1) { + expected_types[struct_col_id] = NC_STR; + ignore_all_children(struct_col_id); + } + if (list_col_id != -1) { + expected_types[list_col_id] = NC_STR; + ignore_all_children(list_col_id); + } + } + } + } else { + // if both are present, error out. + CUDF_EXPECTS(struct_col_id == -1 or list_col_id == -1, + "A mix of lists and structs within the same column is not supported"); + // either one only: so ignore str column. 
+ if ((struct_col_id != -1 or list_col_id != -1) and str_col_id != -1) { + is_pruned[str_col_id] = true; + } + } + }; + + using dev_ref = std::reference_wrapper; + std::unordered_map columns; + columns.try_emplace(parent_node_sentinel, std::ref(root)); + // convert adjaceny list to tree. + dev_ref parent_ref = std::ref(root); + // creates children column + std::function construct_tree; + construct_tree = [&](NodeIndexT root, dev_ref ref) -> void { + if (is_pruned[root]) return; + auto expected_category = + expected_types[root] == NUM_NODE_CLASSES ? column_categories[root] : expected_types[root]; + initialize_json_columns(root, ref, expected_category); + auto child_ids = adj.count(root) ? adj[root] : std::vector{}; + if (expected_category == NC_STRUCT) { + // find field column ids, and its children and create columns. + for (auto const& field_id : child_ids) { + auto name = column_names[field_id]; + if (is_pruned[field_id]) continue; + auto inserted = + ref.get().child_columns.try_emplace(name, device_json_column(stream, mr)).second; + ref.get().column_order.emplace_back(name); + CUDF_EXPECTS(inserted, + "struct child column insertion failed, duplicate column name in the parent"); + auto this_ref = std::ref(ref.get().child_columns.at(name)); + // Mixed type handling + auto& value_col_ids = adj[field_id]; + handle_mixed_types(value_col_ids); + if (value_col_ids.empty()) { + // If no column is present, remove the uninitialized column. + ref.get().child_columns.erase(name); + ref.get().column_order.pop_back(); + continue; + } + for (auto const& child_id : value_col_ids) // children of field (>1 if mixed) + { + if (is_pruned[child_id]) continue; + columns.try_emplace(child_id, this_ref); + construct_tree(child_id, this_ref); + } + } + } else if (expected_category == NC_LIST) { + // array of arrays interpreted as array of structs. + if (is_array_of_arrays and root == named_level) { + // create column names + std::map> array_values; + for (auto const& child_id : child_ids) { + if (is_pruned[child_id]) continue; + auto name = column_names[child_id]; + array_values[std::stoi(name)].push_back(child_id); + } + // + for (auto const& value_id_pair : array_values) { + auto [value_id, value_col_ids] = value_id_pair; + auto name = std::to_string(value_id); + auto inserted = + ref.get().child_columns.try_emplace(name, device_json_column(stream, mr)).second; + ref.get().column_order.emplace_back(name); + CUDF_EXPECTS(inserted, + "list child column insertion failed, duplicate column name in the parent"); + auto this_ref = std::ref(ref.get().child_columns.at(name)); + handle_mixed_types(value_col_ids); + if (value_col_ids.empty()) { + // If no column is present, remove the uninitialized column. 
+ ref.get().child_columns.erase(name); + ref.get().column_order.pop_back(); + continue; + } + for (auto const& child_id : value_col_ids) // children of field (>1 if mixed) + { + if (is_pruned[child_id]) continue; + columns.try_emplace(child_id, this_ref); + construct_tree(child_id, this_ref); + } + } + } else { + if (child_ids.empty()) return; + auto inserted = + ref.get() + .child_columns.try_emplace(list_child_name, device_json_column(stream, mr)) + .second; + CUDF_EXPECTS(inserted, + "list child column insertion failed, duplicate column name in the parent"); + ref.get().column_order.emplace_back(list_child_name); + auto this_ref = std::ref(ref.get().child_columns.at(list_child_name)); + // Mixed type handling + handle_mixed_types(child_ids); + if (child_ids.empty()) { + // If no column is present, remove the uninitialized column. + ref.get().child_columns.erase(list_child_name); + } + for (auto const& child_id : child_ids) { + if (is_pruned[child_id]) continue; + columns.try_emplace(child_id, this_ref); + construct_tree(child_id, this_ref); + } + } + } + }; + auto inserted = parent_ref.get() + .child_columns.try_emplace(list_child_name, device_json_column(stream, mr)) + .second; + CUDF_EXPECTS(inserted, "child column insertion failed, duplicate column name in the parent"); + parent_ref = std::ref(parent_ref.get().child_columns.at(list_child_name)); + columns.try_emplace(adj[parent_node_sentinel][0], parent_ref); + construct_tree(adj[parent_node_sentinel][0], parent_ref); + + // Forced string type due to input schema and mixed type as string. + for (size_t i = 0; i < expected_types.size(); i++) { + if (expected_types[i] == NC_STR) { + if (columns.count(i)) { columns.at(i).get().forced_as_string_column = true; } + } + } + std::transform(expected_types.cbegin(), + expected_types.cend(), + column_categories.cbegin(), + expected_types.begin(), + [](auto exp, auto cat) { return exp == NUM_NODE_CLASSES ? cat : exp; }); + cudaMemcpyAsync(d_column_tree.node_categories.begin(), + expected_types.data(), + expected_types.size() * sizeof(column_categories[0]), + cudaMemcpyDefault, + stream.value()); + + return {is_pruned, columns}; +} +} // namespace experimental + } // namespace cudf::io::json::detail diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index b08fd139113..912e93d52ae 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -47,7 +47,6 @@ namespace cudf::io::json::detail { -// DEBUG prints auto to_cat = [](auto v) -> std::string { switch (v) { case NC_STRUCT: return " S"; @@ -105,19 +104,20 @@ void print_tree(host_span input, * max row offsets of columns */ std::tuple, rmm::device_uvector> -reduce_to_column_tree(tree_meta_t& tree, - device_span original_col_ids, - device_span sorted_col_ids, - device_span ordered_node_ids, - device_span row_offsets, +reduce_to_column_tree(tree_meta_t const& tree, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, + device_span row_offsets, bool is_array_of_arrays, NodeIndexT const row_array_parent_col_id, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); + // 1. column count for allocation - auto const num_columns = - thrust::unique_count(rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end()); + auto const num_columns = thrust::unique_count( + rmm::exec_policy_nosync(stream), sorted_col_ids.begin(), sorted_col_ids.end()); // 2. reduce_by_key {col_id}, {row_offset}, max. 
rmm::device_uvector unique_col_ids(num_columns, stream); @@ -162,30 +162,34 @@ reduce_to_column_tree(tree_meta_t& tree, }); // 4. unique_copy parent_node_ids, ranges - rmm::device_uvector column_levels(0, stream); // not required + rmm::device_uvector column_levels(num_columns, stream); // not required rmm::device_uvector parent_col_ids(num_columns, stream); rmm::device_uvector col_range_begin(num_columns, stream); // Field names rmm::device_uvector col_range_end(num_columns, stream); rmm::device_uvector unique_node_ids(num_columns, stream); - thrust::unique_by_key_copy(rmm::exec_policy(stream), + thrust::unique_by_key_copy(rmm::exec_policy_nosync(stream), sorted_col_ids.begin(), sorted_col_ids.end(), ordered_node_ids.begin(), thrust::make_discard_iterator(), unique_node_ids.begin()); + thrust::copy_n( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_zip_iterator( + thrust::make_permutation_iterator(tree.node_levels.begin(), unique_node_ids.begin()), thrust::make_permutation_iterator(tree.parent_node_ids.begin(), unique_node_ids.begin()), thrust::make_permutation_iterator(tree.node_range_begin.begin(), unique_node_ids.begin()), thrust::make_permutation_iterator(tree.node_range_end.begin(), unique_node_ids.begin())), unique_node_ids.size(), - thrust::make_zip_iterator( - parent_col_ids.begin(), col_range_begin.begin(), col_range_end.begin())); + thrust::make_zip_iterator(column_levels.begin(), + parent_col_ids.begin(), + col_range_begin.begin(), + col_range_end.begin())); // convert parent_node_ids to parent_col_ids thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), parent_col_ids.begin(), parent_col_ids.end(), parent_col_ids.begin(), @@ -203,18 +207,17 @@ reduce_to_column_tree(tree_meta_t& tree, column_categories[parent_col_id] == NC_LIST && (!is_array_of_arrays || parent_col_id != row_array_parent_col_id)); }; + // Mixed types in List children go to different columns, // so all immediate children of list column should have same max_row_offsets. // create list's children max_row_offsets array. (initialize to zero) // atomicMax on children max_row_offsets array. // gather the max_row_offsets from children row offset array. { - rmm::device_uvector list_parents_children_max_row_offsets(num_columns, stream); - thrust::fill(rmm::exec_policy(stream), - list_parents_children_max_row_offsets.begin(), - list_parents_children_max_row_offsets.end(), - 0); - thrust::for_each(rmm::exec_policy(stream), + auto list_parents_children_max_row_offsets = + cudf::detail::make_zeroed_device_uvector_async( + static_cast(num_columns), stream, cudf::get_current_device_resource_ref()); + thrust::for_each(rmm::exec_policy_nosync(stream), unique_col_ids.begin(), unique_col_ids.end(), [column_categories = column_categories.begin(), @@ -230,8 +233,9 @@ reduce_to_column_tree(tree_meta_t& tree, ref.fetch_max(max_row_offsets[col_id], cuda::std::memory_order_relaxed); } }); + thrust::gather_if( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), parent_col_ids.begin(), parent_col_ids.end(), parent_col_ids.begin(), @@ -246,7 +250,7 @@ reduce_to_column_tree(tree_meta_t& tree, // copy lists' max_row_offsets to children. // all structs should have same size. 
thrust::transform_if( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), unique_col_ids.begin(), unique_col_ids.end(), max_row_offsets.begin(), @@ -272,7 +276,7 @@ reduce_to_column_tree(tree_meta_t& tree, // For Struct and List (to avoid copying entire strings when mixed type as string is enabled) thrust::transform_if( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), col_range_begin.begin(), col_range_begin.end(), column_categories.begin(), @@ -313,7 +317,7 @@ std::pair, std::vector> device_json_co // Note: json_col modified here, moves this memory }; - auto get_child_schema = [schema](auto child_name) -> std::optional { + auto get_child_schema = [&schema](auto child_name) -> std::optional { if (schema.has_value()) { auto const result = schema.value().child_types.find(child_name); if (result != std::end(schema.value().child_types)) { return result->second; } @@ -321,6 +325,13 @@ std::pair, std::vector> device_json_co return {}; }; + auto get_list_child_schema = [&schema]() -> std::optional { + if (schema.has_value()) { + if (schema.value().child_types.size() > 0) return schema.value().child_types.begin()->second; + } + return {}; + }; + switch (json_col.type) { case json_col_t::StringColumn: { // move string_offsets to GPU and transform to string column @@ -435,9 +446,8 @@ std::pair, std::vector> device_json_co rmm::device_buffer{}, 0); // Create children column - auto child_schema_element = json_col.child_columns.empty() - ? std::optional{} - : get_child_schema(json_col.child_columns.begin()->first); + auto child_schema_element = + json_col.child_columns.empty() ? std::optional{} : get_list_child_schema(); auto [child_column, names] = json_col.child_columns.empty() or (prune_columns and !child_schema_element.has_value()) ? std::pair, @@ -475,6 +485,16 @@ std::pair, std::vector> device_json_co } } +template +auto make_device_json_column_dispatch(bool experimental, Args&&... args) +{ + if (experimental) { + return experimental::make_device_json_column(std::forward(args)...); + } else { + return make_device_json_column(std::forward(args)...); + } +} + table_with_metadata device_parse_nested_json(device_span d_input, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, @@ -520,6 +540,7 @@ table_with_metadata device_parse_nested_json(device_span d_input, gpu_tree, is_array_of_arrays, options.is_enabled_lines(), + options.is_enabled_experimental(), stream, cudf::get_current_device_resource_ref()); @@ -532,15 +553,16 @@ table_with_metadata device_parse_nested_json(device_span d_input, 0); // Get internal JSON column - make_device_json_column(d_input, - gpu_tree, - gpu_col_id, - gpu_row_offsets, - root_column, - is_array_of_arrays, - options, - stream, - mr); + make_device_json_column_dispatch(options.is_enabled_experimental(), + d_input, + gpu_tree, + gpu_col_id, + gpu_row_offsets, + root_column, + is_array_of_arrays, + options, + stream, + mr); // data_root refers to the root column of the data represented by the given JSON string auto& data_root = diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index 4d0dc010c57..d949635c1cc 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -14,17 +14,18 @@ * limitations under the License. 
*/ -#include "io/utilities/hostdevice_vector.hpp" +#include "io/utilities/parsing_utils.cuh" +#include "io/utilities/string_parsing.hpp" #include "nested_json.hpp" #include #include -#include #include #include #include #include #include +#include #include #include #include @@ -34,12 +35,14 @@ #include #include +#include #include #include #include #include #include #include +#include #include #include #include @@ -492,6 +495,85 @@ tree_meta_t get_tree_representation(device_span tokens, std::move(node_range_end)}; } +// Return field node ids after unicode decoding of field names and matching them to same field names +std::pair> remapped_field_nodes_after_unicode_decode( + device_span d_input, + tree_meta_t const& d_tree, + device_span keys, + rmm::cuda_stream_view stream) +{ + size_t num_keys = keys.size(); + if (num_keys == 0) { return {num_keys, rmm::device_uvector(num_keys, stream)}; } + rmm::device_uvector offsets(num_keys, stream); + rmm::device_uvector lengths(num_keys, stream); + auto offset_length_it = thrust::make_zip_iterator(offsets.begin(), lengths.begin()); + thrust::transform(rmm::exec_policy_nosync(stream), + keys.begin(), + keys.end(), + offset_length_it, + [node_range_begin = d_tree.node_range_begin.data(), + node_range_end = d_tree.node_range_end.data()] __device__(auto key) { + return thrust::make_tuple(node_range_begin[key], + node_range_end[key] - node_range_begin[key]); + }); + cudf::io::parse_options_view opt{',', '\n', '\0', '.'}; + opt.keepquotes = true; + + auto utf8_decoded_fields = parse_data(d_input.data(), + offset_length_it, + num_keys, + data_type{type_id::STRING}, + rmm::device_buffer{}, + 0, + opt, + stream, + cudf::get_current_device_resource_ref()); + // hash using iter, create a hashmap for 0-num_keys. + // insert and find. -> array + // store to static_map with keys as field key[index], and values as key[array[index]] + + auto str_view = strings_column_view{utf8_decoded_fields->view()}; + auto const char_ptr = str_view.chars_begin(stream); + auto const offset_ptr = str_view.offsets().begin(); + + // String hasher + auto const d_hasher = cuda::proclaim_return_type< + typename cudf::hashing::detail::default_hash::result_type>( + [char_ptr, offset_ptr] __device__(auto node_id) { + auto const field_name = cudf::string_view(char_ptr + offset_ptr[node_id], + offset_ptr[node_id + 1] - offset_ptr[node_id]); + return cudf::hashing::detail::default_hash{}(field_name); + }); + auto const d_equal = [char_ptr, offset_ptr] __device__(auto node_id1, auto node_id2) { + auto const field_name1 = cudf::string_view(char_ptr + offset_ptr[node_id1], + offset_ptr[node_id1 + 1] - offset_ptr[node_id1]); + auto const field_name2 = cudf::string_view(char_ptr + offset_ptr[node_id2], + offset_ptr[node_id2 + 1] - offset_ptr[node_id2]); + return field_name1 == field_name2; + }; + + using hasher_type = decltype(d_hasher); + constexpr size_type empty_node_index_sentinel = -1; + auto key_set = cuco::static_set{ + cuco::extent{compute_hash_table_size(num_keys)}, + cuco::empty_key{empty_node_index_sentinel}, + d_equal, + cuco::linear_probing<1, hasher_type>{d_hasher}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + auto const counting_iter = thrust::make_counting_iterator(0); + rmm::device_uvector found_keys(num_keys, stream); + key_set.insert_and_find_async(counting_iter, + counting_iter + num_keys, + found_keys.begin(), + thrust::make_discard_iterator(), + stream.value()); + // set.size will synchronize the stream before return. 
+ return {key_set.size(stream), std::move(found_keys)}; +} + /** * @brief Generates unique node_type id for each node. * Field nodes with the same name are assigned the same node_type id. @@ -500,11 +582,14 @@ tree_meta_t get_tree_representation(device_span tokens, * All inputs and outputs are in node_id order. * @param d_input JSON string in device memory * @param d_tree Tree representation of the JSON + * @param is_enabled_experimental Whether to enable experimental features such as + * utf8 field name support * @param stream CUDA stream used for device memory operations and kernel launches. * @return Vector of node_type ids */ rmm::device_uvector hash_node_type_with_field_name(device_span d_input, tree_meta_t const& d_tree, + bool is_enabled_experimental, rmm::cuda_stream_view stream) { CUDF_FUNC_RANGE(); @@ -536,7 +621,7 @@ rmm::device_uvector hash_node_type_with_field_name(device_span(0); + auto const counting_iter = thrust::make_counting_iterator(0); auto const is_field_name_node = [node_categories = d_tree.node_categories.data()] __device__(auto node_id) { @@ -554,15 +639,61 @@ rmm::device_uvector hash_node_type_with_field_name(device_span{rmm::mr::polymorphic_allocator{}, stream}, stream.value()}; - key_set.insert_if_async(iter, - iter + num_nodes, + key_set.insert_if_async(counting_iter, + counting_iter + num_nodes, thrust::counting_iterator(0), // stencil is_field_name_node, stream.value()); + // experimental feature: utf8 field name support + // parse_data on field names, + // rehash it using another map, + // reassign the reverse map values to new matched node indices. + auto get_utf8_matched_field_nodes = [&]() { + auto make_map = [&stream](auto num_keys) { + using hasher_type3 = cudf::hashing::detail::default_hash; + return cuco::static_map{ + cuco::extent{compute_hash_table_size(num_keys, 100)}, // 100% occupancy + cuco::empty_key{empty_node_index_sentinel}, + cuco::empty_value{empty_node_index_sentinel}, + {}, + cuco::linear_probing<1, hasher_type3>{hasher_type3{}}, + {}, + {}, + cudf::detail::cuco_allocator{rmm::mr::polymorphic_allocator{}, stream}, + stream.value()}; + }; + if (!is_enabled_experimental) { return std::pair{false, make_map(0)}; } + // get all unique field node ids for utf8 decoding + auto num_keys = key_set.size(stream); + rmm::device_uvector keys(num_keys, stream); + key_set.retrieve_all(keys.data(), stream.value()); + + auto [num_unique_fields, found_keys] = + remapped_field_nodes_after_unicode_decode(d_input, d_tree, keys, stream); + + auto is_need_remap = num_unique_fields != num_keys; + if (!is_need_remap) { return std::pair{false, make_map(0)}; } + + // store to static_map with keys as field keys[index], and values as keys[found_keys[index]] + auto reverse_map = make_map(num_keys); + auto matching_keys_iter = thrust::make_permutation_iterator(keys.begin(), found_keys.begin()); + auto pair_iter = + thrust::make_zip_iterator(thrust::make_tuple(keys.begin(), matching_keys_iter)); + reverse_map.insert_async(pair_iter, pair_iter + num_keys, stream); + return std::pair{is_need_remap, std::move(reverse_map)}; + }; + auto [is_need_remap, reverse_map] = get_utf8_matched_field_nodes(); + auto const get_hash_value = - [key_set = key_set.ref(cuco::op::find)] __device__(auto node_id) -> size_type { + [key_set = key_set.ref(cuco::op::find), + is_need_remap = is_need_remap, + rm = reverse_map.ref(cuco::op::find)] __device__(auto node_id) -> size_type { auto const it = key_set.find(node_id); + if (it != key_set.end() and is_need_remap) { + auto const it2 = 
rm.find(*it); + return (it2 == rm.end()) ? size_type{0} : it2->second; + } return (it == key_set.end()) ? size_type{0} : *it; }; @@ -771,6 +902,8 @@ std::pair, rmm::device_uvector> hash_n * @param d_tree Tree representation of the JSON * @param is_array_of_arrays Whether the tree is an array of arrays * @param is_enabled_lines Whether the input is a line-delimited JSON + * @param is_enabled_experimental Whether the experimental feature is enabled such as + * utf8 field name support * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column_id, parent_column_id @@ -780,6 +913,7 @@ std::pair, rmm::device_uvector> gene tree_meta_t const& d_tree, bool is_array_of_arrays, bool is_enabled_lines, + bool is_enabled_experimental, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { @@ -793,7 +927,7 @@ std::pair, rmm::device_uvector> gene auto [col_id, unique_keys] = [&]() { // Convert node_category + field_name to node_type. rmm::device_uvector node_type = - hash_node_type_with_field_name(d_input, d_tree, stream); + hash_node_type_with_field_name(d_input, d_tree, is_enabled_experimental, stream); // hash entire path from node to root. return hash_node_path(d_tree.node_levels, @@ -948,12 +1082,13 @@ records_orient_tree_traversal(device_span d_input, tree_meta_t const& d_tree, bool is_array_of_arrays, bool is_enabled_lines, + bool is_enabled_experimental, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { CUDF_FUNC_RANGE(); - auto [new_col_id, new_parent_col_id] = - generate_column_id(d_input, d_tree, is_array_of_arrays, is_enabled_lines, stream, mr); + auto [new_col_id, new_parent_col_id] = generate_column_id( + d_input, d_tree, is_array_of_arrays, is_enabled_lines, is_enabled_experimental, stream, mr); auto row_offsets = compute_row_offsets( std::move(new_parent_col_id), d_tree, is_array_of_arrays, is_enabled_lines, stream, mr); diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 83f71e657a7..3d9a51833e0 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -185,6 +185,55 @@ struct device_json_column { } }; +namespace experimental { +/* + * @brief Sparse graph adjacency matrix stored in Compressed Sparse Row (CSR) format. + */ +struct compressed_sparse_row { + rmm::device_uvector row_idx; + rmm::device_uvector col_idx; +}; + +/* + * @brief Auxiliary column tree properties that are required to construct the device json + * column subtree, but not required for the final cudf column construction. + */ +struct column_tree_properties { + rmm::device_uvector categories; + rmm::device_uvector max_row_offsets; + rmm::device_uvector mapped_ids; +}; + +namespace detail { +/** + * @brief Reduce node tree into column tree by aggregating each property of column. 
+ * + * @param node_tree Node tree representation of JSON string + * @param original_col_ids Column ids of nodes + * @param sorted_col_ids Sorted column ids of nodes + * @param ordered_node_ids Node ids of nodes sorted by column ids + * @param row_offsets Row offsets of nodes + * @param is_array_of_arrays Whether the tree is an array of arrays + * @param row_array_parent_col_id Column id of row array, if is_array_of_arrays is true + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Tuple of compressed_sparse_row struct storing adjacency information of the column tree, + * and column_tree_properties struct storing properties of each node i.e. column category, max + * number of rows in the column, and column id + */ +CUDF_EXPORT +std::tuple reduce_to_column_tree( + tree_meta_t& node_tree, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, + device_span row_offsets, + bool is_array_of_arrays, + NodeIndexT row_array_parent_col_id, + rmm::cuda_stream_view stream); + +} // namespace detail +} // namespace experimental + namespace detail { // TODO: return device_uvector instead of passing pre-allocated memory @@ -267,6 +316,8 @@ tree_meta_t get_tree_representation(device_span tokens, * index, level, begin index, and end index in the input JSON string * @param is_array_of_arrays Whether the tree is an array of arrays * @param is_enabled_lines Whether the input is a line-delimited JSON + * @param is_enabled_experimental Whether to enable experimental features such as utf-8 field name + * support * @param stream The CUDA stream to which kernels are dispatched * @param mr Optional, resource with which to allocate * @return A tuple of the output column indices and the row offsets within each column for each node @@ -277,6 +328,7 @@ records_orient_tree_traversal(device_span d_input, tree_meta_t const& d_tree, bool is_array_of_arrays, bool is_enabled_lines, + bool is_enabled_experimental, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); @@ -314,12 +366,13 @@ get_array_children_indices(TreeDepthT row_array_children_level, * @return A tuple of column tree representation of JSON string, column ids of columns, and * max row offsets of columns */ +CUDF_EXPORT std::tuple, rmm::device_uvector> -reduce_to_column_tree(tree_meta_t& tree, - device_span original_col_ids, - device_span sorted_col_ids, - device_span ordered_node_ids, - device_span row_offsets, +reduce_to_column_tree(tree_meta_t const& tree, + device_span original_col_ids, + device_span sorted_col_ids, + device_span ordered_node_ids, + device_span row_offsets, bool is_array_of_arrays, NodeIndexT const row_array_parent_col_id, rmm::cuda_stream_view stream); @@ -343,14 +396,30 @@ reduce_to_column_tree(tree_meta_t& tree, * of child_offets and validity members of `d_json_column` */ void make_device_json_column(device_span input, - tree_meta_t& tree, - device_span col_ids, - device_span row_offsets, + tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, device_json_column& root, bool is_array_of_arrays, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr); + +namespace experimental { +/** + * @copydoc cudf::io::json::detail::make_device_json_column + */ +void make_device_json_column(device_span input, + tree_meta_t const& tree, + device_span col_ids, + device_span row_offsets, + device_json_column& root, + bool is_array_of_arrays, + cudf::io::json_reader_options const& 
options, + rmm::cuda_stream_view stream, + rmm::device_async_resource_ref mr); +} // namespace experimental + /** * @brief Retrieves the parse_options to be used for type inference and type casting * diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu index 2dd25a7b890..e1c1d2e3002 100644 --- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu +++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu @@ -1021,6 +1021,76 @@ struct group_key_func { } }; +// merges all the tdigests within each group. returns a table containing 2 columns: +// the sorted means and weights. +template +std::pair, rmm::device_uvector> generate_merged_centroids( + tdigest_column_view const& tdv, + GroupOffsetIter group_offsets, + size_type num_groups, + rmm::cuda_stream_view stream) +{ + auto temp_mr = cudf::get_current_device_resource_ref(); + + auto const total_merged_centroids = tdv.means().size(); + + // output is the merged centroids (means, weights) + rmm::device_uvector output_means(total_merged_centroids, stream, temp_mr); + rmm::device_uvector output_weights(total_merged_centroids, stream, temp_mr); + + // each group represents a collection of tdigest columns. each row is 1 tdigest. + // within each group, we want to sort all the centroids within all the tdigests + // in that group, using the means as the key. the "outer offsets" represent the indices of the + // tdigests, and the "inner offsets" represents the list of centroids for a particular tdigest. + // + // rows + // ---- centroid 0 --------- + // tdigest 0 centroid 1 + // ---- centroid 2 group 0 + // tdigest 1 centroid 3 + // ---- centroid 4 --------- + // tdigest 2 centroid 5 + // ---- centroid 6 group 1 + // tdigest 3 centroid 7 + // centroid 8 + // ---- centroid 9 -------- + auto inner_offsets = tdv.centroids().offsets(); + auto centroid_offsets = cudf::detail::make_counting_transform_iterator( + 0, + cuda::proclaim_return_type( + [group_offsets, inner_offsets = tdv.centroids().offsets().begin()] __device__( + size_type i) { return inner_offsets[group_offsets[i]]; })); + + // perform the sort using the means as the key + size_t temp_size; + CUDF_CUDA_TRY(cub::DeviceSegmentedSort::SortPairs(nullptr, + temp_size, + tdv.means().begin(), + output_means.begin(), + tdv.weights().begin(), + output_weights.begin(), + total_merged_centroids, + num_groups, + centroid_offsets, + centroid_offsets + 1, + stream.value())); + + rmm::device_buffer temp_mem(temp_size, stream, temp_mr); + CUDF_CUDA_TRY(cub::DeviceSegmentedSort::SortPairs(temp_mem.data(), + temp_size, + tdv.means().begin(), + output_means.begin(), + tdv.weights().begin(), + output_weights.begin(), + total_merged_centroids, + num_groups, + centroid_offsets, + centroid_offsets + 1, + stream.value())); + + return {std::move(output_means), std::move(output_weights)}; +} + template std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, HGroupOffsetIter h_outer_offsets, @@ -1032,59 +1102,6 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, rmm::cuda_stream_view stream, rmm::device_async_resource_ref mr) { - // thrust::merge and thrust::merge_by_key don't provide what we need. What we would need is an - // algorithm like a super-merge that takes two layers of keys: one which identifies the outer - // grouping of tdigests, and one which identifies the inner groupings of the tdigests within the - // outer groups. - // TODO: investigate replacing the iterative merge with a single stable_sort_by_key. 
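The generate_merged_centroids helper added above replaces that iterative slice-and-merge with a single segmented sort, and it follows CUB's standard two-phase pattern: the first SortPairs call, made with a null temp-storage pointer, only reports how many scratch bytes are needed; the buffer is allocated in between; the second call does the actual per-segment sort. A minimal standalone sketch of that idiom on toy data (illustrative only, not the tdigest code itself):

// Minimal sketch of the cub::DeviceSegmentedSort::SortPairs two-phase idiom.
// Toy keys/values/offsets; not the tdigest centroid data.
#include <cub/device/device_segmented_sort.cuh>
#include <thrust/device_vector.h>
#include <cstddef>
#include <cstdio>
#include <vector>

int main()
{
  std::vector<double> h_keys{3, 1, 2, 9, 7, 8};  // two segments: [0,3) and [3,6)
  std::vector<double> h_vals{30, 10, 20, 90, 70, 80};
  std::vector<int>    h_off{0, 3, 6};

  thrust::device_vector<double> keys_in(h_keys.begin(), h_keys.end()), keys_out(6);
  thrust::device_vector<double> vals_in(h_vals.begin(), h_vals.end()), vals_out(6);
  thrust::device_vector<int>    offsets(h_off.begin(), h_off.end());

  std::size_t temp_bytes = 0;
  // Phase 1: with a null temp pointer, the call only computes the scratch size.
  cub::DeviceSegmentedSort::SortPairs(nullptr, temp_bytes,
                                      keys_in.data().get(), keys_out.data().get(),
                                      vals_in.data().get(), vals_out.data().get(),
                                      6 /*num_items*/, 2 /*num_segments*/,
                                      offsets.data().get(), offsets.data().get() + 1);
  thrust::device_vector<unsigned char> temp(temp_bytes);
  // Phase 2: same arguments plus real temp storage; sorts each segment independently.
  cub::DeviceSegmentedSort::SortPairs(temp.data().get(), temp_bytes,
                                      keys_in.data().get(), keys_out.data().get(),
                                      vals_in.data().get(), vals_out.data().get(),
                                      6, 2,
                                      offsets.data().get(), offsets.data().get() + 1);
  cudaDeviceSynchronize();
  std::printf("segment 1 starts at key %g\n", static_cast<double>(keys_out[3]));  // expect 7
  return 0;
}

The begin/end offset iterators bound each segment, which is what keeps the centroids of different groups independent during the sort, exactly as the centroid_offsets iterator does above.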
- - // bring tdigest offsets back to the host - auto tdigest_offsets = tdv.centroids().offsets(); - std::vector h_inner_offsets(tdigest_offsets.size()); - cudaMemcpyAsync(h_inner_offsets.data(), - tdigest_offsets.begin(), - sizeof(size_type) * tdigest_offsets.size(), - cudaMemcpyDefault, - stream); - - stream.synchronize(); - - // extract all means and weights into a table - cudf::table_view tdigests_unsliced({tdv.means(), tdv.weights()}); - - // generate the merged (but not yet compressed) tdigests for each group. - std::vector> tdigests; - tdigests.reserve(num_groups); - std::transform(h_outer_offsets, - h_outer_offsets + num_groups, - std::next(h_outer_offsets), - std::back_inserter(tdigests), - [&](auto tdigest_start, auto tdigest_end) { - // the range of tdigests in this group - auto const num_tdigests = tdigest_end - tdigest_start; - - // slice each tdigest from the input - std::vector unmerged_tdigests; - unmerged_tdigests.reserve(num_tdigests); - auto offset_iter = std::next(h_inner_offsets.begin(), tdigest_start); - std::transform( - offset_iter, - offset_iter + num_tdigests, - std::next(offset_iter), - std::back_inserter(unmerged_tdigests), - [&](size_type start, size_type end) { - return cudf::detail::slice(tdigests_unsliced, {start, end}, stream); - }); - - // merge - return cudf::detail::merge(unmerged_tdigests, - {0}, - {order::ASCENDING}, - {}, - stream, - cudf::get_current_device_resource_ref()); - }); - // generate min and max values auto merged_min_col = cudf::make_numeric_column( data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr); @@ -1121,7 +1138,7 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, auto group_num_weights = cudf::detail::make_counting_transform_iterator( 0, group_num_weights_func{group_offsets, - tdigest_offsets.begin()}); + tdv.centroids().offsets().begin()}); thrust::replace_if(rmm::exec_policy(stream), merged_min_col->mutable_view().begin(), merged_min_col->mutable_view().end(), @@ -1135,29 +1152,33 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, group_is_empty{}, 0); - // concatenate all the merged tdigests back into one table. 
- std::vector tdigest_views; - tdigest_views.reserve(num_groups); - std::transform(tdigests.begin(), - tdigests.end(), - std::back_inserter(tdigest_views), - [](std::unique_ptr const& t) { return t->view(); }); - auto merged = - cudf::detail::concatenate(tdigest_views, stream, cudf::get_current_device_resource_ref()); + auto temp_mr = cudf::get_current_device_resource_ref(); + + // merge the centroids + auto [merged_means, merged_weights] = + generate_merged_centroids(tdv, group_offsets, num_groups, stream); + size_t const num_centroids = tdv.means().size(); + CUDF_EXPECTS(merged_means.size() == num_centroids, + "Unexpected number of centroids in merged result"); // generate cumulative weights - auto merged_weights = merged->get_column(1).view(); - auto cumulative_weights = cudf::make_numeric_column( - data_type{type_id::FLOAT64}, merged_weights.size(), mask_state::UNALLOCATED, stream); - auto keys = cudf::detail::make_counting_transform_iterator( - 0, - group_key_func{ - group_labels, tdigest_offsets.begin(), tdigest_offsets.size()}); + rmm::device_uvector cumulative_weights(merged_weights.size(), stream, temp_mr); + + // generate group keys for all centroids in the entire column + rmm::device_uvector group_keys(num_centroids, stream, temp_mr); + auto iter = thrust::make_counting_iterator(0); + auto inner_offsets = tdv.centroids().offsets(); + thrust::transform(rmm::exec_policy(stream), + iter, + iter + num_centroids, + group_keys.begin(), + group_key_func{ + group_labels, inner_offsets.begin(), inner_offsets.size()}); thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - keys, - keys + cumulative_weights->size(), - merged_weights.begin(), - cumulative_weights->mutable_view().begin()); + group_keys.begin(), + group_keys.begin() + num_centroids, + merged_weights.begin(), + cumulative_weights.begin()); auto const delta = max_centroids; @@ -1166,37 +1187,32 @@ std::unique_ptr merge_tdigests(tdigest_column_view const& tdv, delta, num_groups, nearest_value_centroid_weights{ - cumulative_weights->view().begin(), - group_offsets, - tdigest_offsets.begin()}, - centroid_group_info{cumulative_weights->view().begin(), - group_offsets, - tdigest_offsets.begin()}, + cumulative_weights.begin(), group_offsets, inner_offsets.begin()}, + centroid_group_info{ + cumulative_weights.begin(), group_offsets, inner_offsets.begin()}, cumulative_centroid_weight{ - cumulative_weights->view().begin(), + cumulative_weights.begin(), group_labels, group_offsets, - {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, + {inner_offsets.begin(), static_cast(inner_offsets.size())}}, false, stream, mr); // input centroid values auto centroids = cudf::detail::make_counting_transform_iterator( - 0, - make_weighted_centroid{merged->get_column(0).view().begin(), - merged_weights.begin()}); + 0, make_weighted_centroid{merged_means.begin(), merged_weights.begin()}); // compute the tdigest return compute_tdigests( delta, centroids, - centroids + merged->num_rows(), + centroids + merged_means.size(), cumulative_centroid_weight{ - cumulative_weights->view().begin(), + cumulative_weights.begin(), group_labels, group_offsets, - {tdigest_offsets.begin(), static_cast(tdigest_offsets.size())}}, + {inner_offsets.begin(), static_cast(inner_offsets.size())}}, std::move(merged_min_col), std::move(merged_max_col), group_cluster_wl, diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 288fa84a73d..b67d922d377 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -329,6 +329,7 @@ 
ConfigureTest(NESTED_JSON_TEST io/json/nested_json_test.cpp io/json/json_tree.cp ConfigureTest(MULTIBYTE_SPLIT_TEST io/text/multibyte_split_test.cpp) ConfigureTest(JSON_QUOTE_NORMALIZATION io/json/json_quote_normalization_test.cpp) ConfigureTest(JSON_WHITESPACE_NORMALIZATION io/json/json_whitespace_normalization_test.cu) +ConfigureTest(JSON_TREE_CSR io/json/json_tree_csr.cu) ConfigureTest( DATA_CHUNK_SOURCE_TEST io/text/data_chunk_source_test.cpp GPUS 1 diff --git a/cpp/tests/io/json/json_test.cpp b/cpp/tests/io/json/json_test.cpp index 48bc982d0e3..68ec255b39d 100644 --- a/cpp/tests/io/json/json_test.cpp +++ b/cpp/tests/io/json/json_test.cpp @@ -2856,6 +2856,59 @@ TEST_F(JsonReaderTest, JSONMixedTypeChildren) } } +TEST_F(JsonReaderTest, MixedTypesWithSchema) +{ + std::string data = "{\"data\": {\"A\": 0, \"B\": 1}}\n{\"data\": [1,0]}\n"; + + std::map data_types; + std::map child_types; + child_types.insert( + std::pair{"element", cudf::io::schema_element{cudf::data_type{cudf::type_id::STRING, 0}, {}}}); + data_types.insert(std::pair{ + "data", cudf::io::schema_element{cudf::data_type{cudf::type_id::LIST, 0}, child_types}}); + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .dtypes(data_types) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .normalize_single_quotes(true) + .normalize_whitespace(true) + .mixed_types_as_string(true) + .experimental(true) + .keep_quotes(true) + .lines(true); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + EXPECT_EQ(result.tbl->num_columns(), 1); + EXPECT_EQ(result.tbl->num_rows(), 2); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::LIST); + EXPECT_EQ(result.tbl->get_column(0).child(1).type().id(), cudf::type_id::STRING); +} + +TEST_F(JsonReaderTest, UnicodeFieldname) +{ + // unicode at nested and leaf levels + std::string data = R"({"data": {"a": 0, "b c": 1}} + {"data": {"\u0061": 2, "\u0062\tc": 3}} + {"d\u0061ta": {"a": 4}})"; + + cudf::io::json_reader_options in_options = + cudf::io::json_reader_options::builder(cudf::io::source_info{data.data(), data.size()}) + .recovery_mode(cudf::io::json_recovery_mode_t::RECOVER_WITH_NULL) + .experimental(true) + .lines(true); + cudf::io::table_with_metadata result = cudf::io::read_json(in_options); + EXPECT_EQ(result.tbl->num_columns(), 1); + EXPECT_EQ(result.tbl->num_rows(), 3); + EXPECT_EQ(result.tbl->get_column(0).type().id(), cudf::type_id::STRUCT); + EXPECT_EQ(result.tbl->get_column(0).num_children(), 2); + EXPECT_EQ(result.tbl->get_column(0).child(0).type().id(), cudf::type_id::INT64); + EXPECT_EQ(result.tbl->get_column(0).child(1).type().id(), cudf::type_id::INT64); + EXPECT_EQ(result.metadata.schema_info.at(0).name, "data"); + EXPECT_EQ(result.metadata.schema_info.at(0).children.at(0).name, "a"); + EXPECT_EQ(result.metadata.schema_info.at(0).children.at(1).name, "b\tc"); + EXPECT_EQ(result.metadata.schema_info.at(0).children.size(), 2); +} + TEST_F(JsonReaderTest, JsonDtypeSchema) { std::string data = R"( diff --git a/cpp/tests/io/json/json_tree.cpp b/cpp/tests/io/json/json_tree.cpp index 875cc467b6a..15682c6ae6b 100644 --- a/cpp/tests/io/json/json_tree.cpp +++ b/cpp/tests/io/json/json_tree.cpp @@ -889,6 +889,7 @@ TEST_P(JsonTreeTraversalTest, CPUvsGPUTraversal) gpu_tree, is_array_of_arrays, json_lines, + false, stream, cudf::get_current_device_resource_ref()); #if LIBCUDF_JSON_DEBUG_DUMP diff --git a/cpp/tests/io/json/json_tree_csr.cu 
b/cpp/tests/io/json/json_tree_csr.cu new file mode 100644 index 00000000000..f988ae24b38 --- /dev/null +++ b/cpp/tests/io/json/json_tree_csr.cu @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2024, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "io/json/nested_json.hpp" + +#include +#include + +#include +#include +#include +#include +#include + +#include +#include +#include + +#include +#include + +#include + +namespace cuio_json = cudf::io::json; + +struct h_tree_meta_t { + std::vector node_categories; + std::vector parent_node_ids; + std::vector node_range_begin; + std::vector node_range_end; +}; + +struct h_column_tree { + // position of nnzs + std::vector row_idx; + std::vector col_idx; + // node properties + std::vector categories; + std::vector column_ids; +}; + +// debug printing +template +void print(cudf::host_span vec, std::string name) +{ + std::cout << name << " = "; + for (auto e : vec) { + std::cout << e << " "; + } + std::cout << std::endl; +} + +bool check_equality(cuio_json::tree_meta_t& d_a, + cudf::device_span d_a_max_row_offsets, + cuio_json::experimental::compressed_sparse_row& d_b_csr, + cuio_json::experimental::column_tree_properties& d_b_ctp, + rmm::cuda_stream_view stream) +{ + // convert from tree_meta_t to column_tree_csr + stream.synchronize(); + + h_tree_meta_t a{cudf::detail::make_std_vector_async(d_a.node_categories, stream), + cudf::detail::make_std_vector_async(d_a.parent_node_ids, stream), + cudf::detail::make_std_vector_async(d_a.node_range_begin, stream), + cudf::detail::make_std_vector_async(d_a.node_range_end, stream)}; + + h_column_tree b{cudf::detail::make_std_vector_async(d_b_csr.row_idx, stream), + cudf::detail::make_std_vector_async(d_b_csr.col_idx, stream), + cudf::detail::make_std_vector_async(d_b_ctp.categories, stream), + cudf::detail::make_std_vector_async(d_b_ctp.mapped_ids, stream)}; + + auto a_max_row_offsets = cudf::detail::make_std_vector_async(d_a_max_row_offsets, stream); + auto b_max_row_offsets = cudf::detail::make_std_vector_async(d_b_ctp.max_row_offsets, stream); + + stream.synchronize(); + + auto num_nodes = a.parent_node_ids.size(); + if (num_nodes > 1) { + if (b.row_idx.size() != num_nodes + 1) { return false; } + + for (auto pos = b.row_idx[0]; pos < b.row_idx[1]; pos++) { + auto v = b.col_idx[pos]; + if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[0]) { return false; } + } + for (size_t u = 1; u < num_nodes; u++) { + auto v = b.col_idx[b.row_idx[u]]; + if (a.parent_node_ids[b.column_ids[u]] != b.column_ids[v]) { return false; } + + for (auto pos = b.row_idx[u] + 1; pos < b.row_idx[u + 1]; pos++) { + v = b.col_idx[pos]; + if (a.parent_node_ids[b.column_ids[v]] != b.column_ids[u]) { return false; } + } + } + for (size_t u = 0; u < num_nodes; u++) { + if (a.node_categories[b.column_ids[u]] != b.categories[u]) { return false; } + } + for (size_t u = 0; u < num_nodes; u++) { + if (a_max_row_offsets[b.column_ids[u]] != b_max_row_offsets[u]) { return false; } + } + } else if (num_nodes == 
1) { + if (b.row_idx.size() != num_nodes + 1) { return false; } + + if (b.row_idx[0] != 0 || b.row_idx[1] != 1) return false; + if (!b.col_idx.empty()) return false; + for (size_t u = 0; u < num_nodes; u++) { + if (a.node_categories[b.column_ids[u]] != b.categories[u]) { return false; } + } + + for (size_t u = 0; u < num_nodes; u++) { + if (a_max_row_offsets[b.column_ids[u]] != b_max_row_offsets[u]) { return false; } + } + } + return true; +} + +void run_test(std::string const& input, bool enable_lines = true) +{ + auto const stream = cudf::get_default_stream(); + cudf::string_scalar d_scalar(input, true, stream); + auto d_input = cudf::device_span{d_scalar.data(), + static_cast(d_scalar.size())}; + + cudf::io::json_reader_options options{}; + options.enable_lines(enable_lines); + options.enable_mixed_types_as_string(true); + + // Parse the JSON and get the token stream + auto const [tokens_gpu, token_indices_gpu] = cudf::io::json::detail::get_token_stream( + d_input, options, stream, cudf::get_current_device_resource_ref()); + + // Get the JSON's tree representation + auto gpu_tree = + cuio_json::detail::get_tree_representation(tokens_gpu, + token_indices_gpu, + options.is_enabled_mixed_types_as_string(), + stream, + cudf::get_current_device_resource_ref()); + + bool const is_array_of_arrays = [&]() { + std::array h_node_categories = {cuio_json::NC_ERR, cuio_json::NC_ERR}; + auto const size_to_copy = std::min(size_t{2}, gpu_tree.node_categories.size()); + CUDF_CUDA_TRY(cudaMemcpyAsync(h_node_categories.data(), + gpu_tree.node_categories.data(), + sizeof(cuio_json::node_t) * size_to_copy, + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + if (options.is_enabled_lines()) return h_node_categories[0] == cuio_json::NC_LIST; + return h_node_categories[0] == cuio_json::NC_LIST and + h_node_categories[1] == cuio_json::NC_LIST; + }(); + + auto tup = + cuio_json::detail::records_orient_tree_traversal(d_input, + gpu_tree, + is_array_of_arrays, + options.is_enabled_lines(), + false, + stream, + rmm::mr::get_current_device_resource()); + auto& gpu_col_id = std::get<0>(tup); + auto& gpu_row_offsets = std::get<1>(tup); + + auto const num_nodes = gpu_col_id.size(); + rmm::device_uvector sorted_col_ids(gpu_col_id.size(), stream); // make a copy + thrust::copy( + rmm::exec_policy(stream), gpu_col_id.begin(), gpu_col_id.end(), sorted_col_ids.begin()); + + // sort by {col_id} on {node_ids} stable + rmm::device_uvector node_ids(gpu_col_id.size(), stream); + thrust::sequence(rmm::exec_policy(stream), node_ids.begin(), node_ids.end()); + thrust::stable_sort_by_key( + rmm::exec_policy(stream), sorted_col_ids.begin(), sorted_col_ids.end(), node_ids.begin()); + + cudf::size_type const row_array_parent_col_id = [&]() { + cudf::size_type value = cuio_json::parent_node_sentinel; + auto const list_node_index = options.is_enabled_lines() ? 
0 : 1; + CUDF_CUDA_TRY(cudaMemcpyAsync(&value, + gpu_col_id.data() + list_node_index, + sizeof(cudf::size_type), + cudaMemcpyDefault, + stream.value())); + stream.synchronize(); + return value; + }(); + + auto [d_column_tree, d_unique_col_ids, d_max_row_offsets] = + cudf::io::json::detail::reduce_to_column_tree(gpu_tree, + gpu_col_id, + sorted_col_ids, + node_ids, + gpu_row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + + auto [d_column_tree_csr, d_column_tree_properties] = + cudf::io::json::experimental::detail::reduce_to_column_tree(gpu_tree, + gpu_col_id, + sorted_col_ids, + node_ids, + gpu_row_offsets, + is_array_of_arrays, + row_array_parent_col_id, + stream); + + auto iseq = check_equality( + d_column_tree, d_max_row_offsets, d_column_tree_csr, d_column_tree_properties, stream); + // assert equality between csr and meta formats + ASSERT_TRUE(iseq); +} + +struct JsonColumnTreeTests : public cudf::test::BaseFixture {}; + +TEST_F(JsonColumnTreeTests, JSONL_Small) +{ + std::string const input = + R"( {} + { "a": { "y" : 6, "z": [] }} + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )"; // Prepare input & output buffers + run_test(input); +} + +TEST_F(JsonColumnTreeTests, JSONL_Large) +{ + std::string const input = + R"( {} + {} + { "a": { "y" : 6, "z": [] }} + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} + { "a": { "y" : 6, "z": [] }} + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} + { "a": { "y" : 6, "z": [] }} + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} + { "a": { "y" : 6, "z": [] }} + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} )"; + run_test(input); +} + +TEST_F(JsonColumnTreeTests, JSONL_ListofStruct) +{ + std::string const input = R"( + { "Root": { "Key": [ { "EE": "A" } ] } } + { "Root": { "Key": { } } } + { "Root": { "Key": [{ "YY": 1}] } } + )"; + run_test(input); +} + +TEST_F(JsonColumnTreeTests, JSONL_MissingEntries) +{ + std::string json_stringl = R"( + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true} + {"a": 1, "b": {"0": "abc" }, "c": false} + {"a": 1, "b": {}} + {"a": 1, "c": null} + )"; + run_test(json_stringl); +} + +TEST_F(JsonColumnTreeTests, JSONL_MoreMissingEntries) +{ + std::string json_stringl = R"( + { "foo1": [1,2,3], "bar": 123 } + { "foo2": { "a": 1 }, "bar": 456 } + { "foo1": [1,2,3], "bar": 123 } + { "foo2": { "a": 1 }, "bar": 456 } + { "foo1": [1,2,3], "bar": 123 } + { "foo2": { "a": 1 }, "bar": 456 } + )"; + run_test(json_stringl); +} + +TEST_F(JsonColumnTreeTests, JSONL_StillMoreMissingEntries) +{ + std::string json_stringl = R"( + { "foo1": [1,2,3], "bar": 123 } + { "foo2": { "a": 1 }, "bar": 456 } + { "foo1": ["123","456"], "bar": 123 } + { "foo2": { "b": 5 }, "car": 456 } + { "foo1": [1,2,3], "bar": 123 } + { "foo2": { "a": 1 }, "bar": 456 } + )"; + run_test(json_stringl); +} + +TEST_F(JsonColumnTreeTests, JSON_MissingEntries) +{ + std::string json_string = R"([ + {"a": 1, "b": {"0": "abc", "1": [-1.]}, "c": true}, + {"a": 1, "b": {"0": "abc" }, "c": false}, + {"a": 1, "b": {}}, + {"a": 1, "c": null} + ])"; + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSON_StructOfStructs) +{ + std::string json_string = + R"([ + {}, + { "a": { "y" : 6, "z": [] }}, + { "a" : { "x" : 8, "y": 9 }, "b" : {"x": 10 , "z": 11 }} + ])"; // Prepare input & output buffers + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSONL_ArrayOfArrays_NestedList) +{ + std::string json_string = + R"([123, [1,2,3]] + [456, null, { "a": 1 }])"; + run_test(json_string); +} 
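The tests in this file all funnel through check_equality above, which reads the experimental output as a CSR adjacency: row_idx gives offsets into col_idx, row 0 lists the root's children, and every other row u lists its parent first and then its children, with column_ids mapping CSR rows back to the original column ids. A host-only sketch of that layout on a toy parent array, assuming exactly the convention check_equality walks (hypothetical helper, not the GPU implementation in reduce_to_column_tree):

// Build the parent-first-then-children CSR layout from a parent-id array
// (parent of node 0 is -1). Toy host code for illustration only.
#include <cstdio>
#include <vector>

struct csr { std::vector<int> row_idx, col_idx; };

csr parent_ids_to_csr(std::vector<int> const& parent)
{
  int const n = static_cast<int>(parent.size());
  std::vector<std::vector<int>> children(n);
  for (int v = 1; v < n; ++v) children[parent[v]].push_back(v);

  csr out;
  out.row_idx.push_back(0);
  for (int u = 0; u < n; ++u) {
    if (u != 0) out.col_idx.push_back(parent[u]);        // parent comes first ...
    for (int c : children[u]) out.col_idx.push_back(c);  // ... then the children
    out.row_idx.push_back(static_cast<int>(out.col_idx.size()));
  }
  return out;
}

int main()
{
  // toy tree: root(0) has children 1 and 3; node 1 has child 2
  auto g = parent_ids_to_csr({-1, 0, 1, 0});
  for (int v : g.row_idx) std::printf("%d ", v);  // prints: 0 2 4 5 6
  std::printf("| ");
  for (int v : g.col_idx) std::printf("%d ", v);  // prints: 1 3 0 2 1 0
  std::printf("\n");
}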
+ +TEST_F(JsonColumnTreeTests, JSON_ArrayofArrays_NestedList) +{ + std::string json_string = R"([[[1,2,3], null, 123], + [null, { "a": 1 }, 456 ]])"; + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSON_CornerCase_Empty) +{ + std::string json_string = R"([])"; + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSONL_CornerCase_List) +{ + std::string json_string = R"([123])"; + run_test(json_string, true); +} + +TEST_F(JsonColumnTreeTests, JSON_CornerCase_EmptyNestedList) +{ + std::string json_string = R"([[[]]])"; + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSON_CornerCase_EmptyNestedLists) +{ + std::string json_string = R"([[], [], []])"; + run_test(json_string, false); +} + +TEST_F(JsonColumnTreeTests, JSONL_CornerCase_ListofLists) +{ + std::string json_string = R"([[1, 2, 3], [4, 5, null], []])"; + run_test(json_string, true); +} + +TEST_F(JsonColumnTreeTests, JSONL_CornerCase_EmptyListOfLists) +{ + std::string json_string = R"([[]])"; + run_test(json_string, true); +} diff --git a/dependencies.yaml b/dependencies.yaml index 339adbc5ff9..6909eb7168d 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -25,6 +25,7 @@ files: - rapids_build_setuptools - run_common - run_cudf + - run_cudf_polars - run_pylibcudf - run_dask_cudf - run_custreamz diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst index d6f8cd2a1ff..e21536e2e97 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/index.rst @@ -25,6 +25,7 @@ This page provides API documentation for pylibcudf. lists merge null_mask + partitioning quantiles reduce replace @@ -38,6 +39,7 @@ This page provides API documentation for pylibcudf. table traits transform + transpose types unary diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst index c8933981736..53638f071cc 100644 --- a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/index.rst @@ -19,3 +19,4 @@ I/O Functions csv json parquet + timezone diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/io/timezone.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/timezone.rst new file mode 100644 index 00000000000..20c1ffc2e93 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/io/timezone.rst @@ -0,0 +1,6 @@ +======== +Timezone +======== + +.. automodule:: pylibcudf.io.timezone + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst new file mode 100644 index 00000000000..6951dbecca0 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/partitioning.rst @@ -0,0 +1,6 @@ +============ +partitioning +============ + +.. automodule:: pylibcudf.partitioning + :members: diff --git a/docs/cudf/source/user_guide/api_docs/pylibcudf/transpose.rst b/docs/cudf/source/user_guide/api_docs/pylibcudf/transpose.rst new file mode 100644 index 00000000000..6241295e770 --- /dev/null +++ b/docs/cudf/source/user_guide/api_docs/pylibcudf/transpose.rst @@ -0,0 +1,6 @@ +========= +transpose +========= + +.. 
automodule:: pylibcudf.transpose + :members: diff --git a/docs/dask_cudf/source/best_practices.rst b/docs/dask_cudf/source/best_practices.rst index 142124163af..83039f86fed 100644 --- a/docs/dask_cudf/source/best_practices.rst +++ b/docs/dask_cudf/source/best_practices.rst @@ -252,6 +252,15 @@ result in a simple 1-to-1 mapping between files and output partitions. correspond to a reasonable partition size, use ``blocksize=None`` to avoid unnecessary metadata collection. +.. note:: + When reading from remote storage (e.g. S3 and GCS), performance will + likely improve with ``filesystem="arrow"``. When this option is set, + PyArrow will be used to perform IO on multiple CPU threads. Please be + aware that this feature is experimental, and behavior may change in + the future (without deprecation). Do not pass in ``blocksize`` or + ``aggregate_files`` when this feature is used. Instead, set the + ``"dataframe.parquet.minimum-partition-size"`` config to control + file aggregation. Use :func:`from_map` ~~~~~~~~~~~~~~~~~~~~ diff --git a/docs/dask_cudf/source/index.rst b/docs/dask_cudf/source/index.rst index 23ca7e49753..6eb755d7854 100644 --- a/docs/dask_cudf/source/index.rst +++ b/docs/dask_cudf/source/index.rst @@ -40,9 +40,10 @@ Using Dask cuDF The Dask DataFrame API (Recommended) ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ -Simply use the `Dask configuration `__ system to -set the ``"dataframe.backend"`` option to ``"cudf"``. From Python, -this can be achieved like so:: +Simply use the `Dask configuration +`__ +system to set the ``"dataframe.backend"`` option to ``"cudf"``. +From Python, this can be achieved like so:: import dask diff --git a/java/src/main/java/ai/rapids/cudf/JSONOptions.java b/java/src/main/java/ai/rapids/cudf/JSONOptions.java index 2bb74c3e3b1..e41cc15712f 100644 --- a/java/src/main/java/ai/rapids/cudf/JSONOptions.java +++ b/java/src/main/java/ai/rapids/cudf/JSONOptions.java @@ -39,6 +39,7 @@ public final class JSONOptions extends ColumnFilterOptions { private final boolean allowNonNumericNumbers; private final boolean allowUnquotedControlChars; private final boolean cudfPruneSchema; + private final boolean experimental; private final byte lineDelimiter; private JSONOptions(Builder builder) { @@ -55,6 +56,7 @@ private JSONOptions(Builder builder) { allowNonNumericNumbers = builder.allowNonNumericNumbers; allowUnquotedControlChars = builder.allowUnquotedControlChars; cudfPruneSchema = builder.cudfPruneSchema; + experimental = builder.experimental; lineDelimiter = builder.lineDelimiter; } @@ -111,6 +113,10 @@ public boolean unquotedControlChars() { return allowUnquotedControlChars; } + public boolean experimental() { + return experimental; + } + @Override String[] getIncludeColumnNames() { throw new UnsupportedOperationException("JSON reader didn't support column prune"); @@ -136,6 +142,7 @@ public static final class Builder extends ColumnFilterOptions.Builder(line_delimiter)) .strict_validation(strict_validation) + .experimental(experimental) .keep_quotes(keep_quotes) .prune_columns(false); if (strict_validation) { @@ -1680,6 +1682,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, jboolean allow_leading_zeros, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, + jboolean experimental, jbyte line_delimiter) { JNI_NULL_CHECK(env, buffer, "buffer cannot be null", 0); @@ -1705,6 +1708,7 @@ Java_ai_rapids_cudf_Table_readAndInferJSON(JNIEnv* env, .strict_validation(strict_validation) .mixed_types_as_string(mixed_types_as_string) .prune_columns(false) + 
.experimental(experimental) .delimiter(static_cast(line_delimiter)) .keep_quotes(keep_quotes); if (strict_validation) { @@ -1821,6 +1825,7 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, jboolean prune_columns, + jboolean experimental, jbyte line_delimiter, jlong ds_handle) { @@ -1859,7 +1864,8 @@ Java_ai_rapids_cudf_Table_readJSONFromDataSource(JNIEnv* env, .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes) - .prune_columns(prune_columns); + .prune_columns(prune_columns) + .experimental(experimental); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) @@ -1920,6 +1926,7 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, jboolean allow_nonnumeric_numbers, jboolean allow_unquoted_control, jboolean prune_columns, + jboolean experimental, jbyte line_delimiter) { bool read_buffer = true; @@ -1972,7 +1979,8 @@ JNIEXPORT jlong JNICALL Java_ai_rapids_cudf_Table_readJSON(JNIEnv* env, .delimiter(static_cast(line_delimiter)) .strict_validation(strict_validation) .keep_quotes(keep_quotes) - .prune_columns(prune_columns); + .prune_columns(prune_columns) + .experimental(experimental); if (strict_validation) { opts.numeric_leading_zeros(allow_leading_zeros) .nonnumeric_numbers(allow_nonnumeric_numbers) diff --git a/python/cudf/cudf/_lib/hash.pyx b/python/cudf/cudf/_lib/hash.pyx index 48f75b12a73..9b7ab0888d2 100644 --- a/python/cudf/cudf/_lib/hash.pyx +++ b/python/cudf/cudf/_lib/hash.pyx @@ -3,11 +3,8 @@ from cudf.core.buffer import acquire_spill_lock from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair from libcpp.utility cimport move -from libcpp.vector cimport vector -cimport pylibcudf.libcudf.types as libcudf_types from pylibcudf.libcudf.column.column cimport column from pylibcudf.libcudf.hash cimport ( md5, @@ -19,37 +16,23 @@ from pylibcudf.libcudf.hash cimport ( sha512, xxhash_64, ) -from pylibcudf.libcudf.partitioning cimport ( - hash_partition as cpp_hash_partition, -) -from pylibcudf.libcudf.table.table cimport table from pylibcudf.libcudf.table.table_view cimport table_view from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns +from cudf._lib.utils cimport table_view_from_columns + +import pylibcudf as plc @acquire_spill_lock() -def hash_partition(list source_columns, object columns_to_hash, +def hash_partition(list source_columns, list columns_to_hash, int num_partitions): - cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash - cdef int c_num_partitions = num_partitions - cdef table_view c_source_view = table_view_from_columns(source_columns) - - cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result - with nogil: - c_result = move( - cpp_hash_partition( - c_source_view, - c_columns_to_hash, - c_num_partitions - ) - ) - - return ( - columns_from_unique_ptr(move(c_result.first)), - list(c_result.second) + plc_table, offsets = plc.partitioning.hash_partition( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), + columns_to_hash, + num_partitions ) + return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets @acquire_spill_lock() diff --git a/python/cudf/cudf/_lib/io/utils.pxd b/python/cudf/cudf/_lib/io/utils.pxd index 1938f00c179..76a6e32fde0 100644 --- a/python/cudf/cudf/_lib/io/utils.pxd +++ 
b/python/cudf/cudf/_lib/io/utils.pxd @@ -21,6 +21,10 @@ cdef add_df_col_struct_names( df, child_names_dict ) +cdef update_col_struct_field_names( + Column col, + child_names +) cdef update_struct_field_names( table, vector[column_name_info]& schema_info) diff --git a/python/cudf/cudf/_lib/orc.pyx b/python/cudf/cudf/_lib/orc.pyx index adeba6fffb1..f88c48ce989 100644 --- a/python/cudf/cudf/_lib/orc.pyx +++ b/python/cudf/cudf/_lib/orc.pyx @@ -1,8 +1,5 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -import cudf -from cudf.core.buffer import acquire_spill_lock - from libc.stdint cimport int64_t from libcpp cimport bool, int from libcpp.map cimport map @@ -11,187 +8,43 @@ from libcpp.string cimport string from libcpp.utility cimport move from libcpp.vector cimport vector -import datetime from collections import OrderedDict -cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view - try: import ujson as json except ImportError: import json cimport pylibcudf.libcudf.io.types as cudf_io_types +cimport pylibcudf.libcudf.lists.lists_column_view as cpp_lists_column_view from pylibcudf.libcudf.io.data_sink cimport data_sink from pylibcudf.libcudf.io.orc cimport ( chunked_orc_writer_options, orc_chunked_writer, - orc_reader_options, orc_writer_options, - read_orc as libcudf_read_orc, write_orc as libcudf_write_orc, ) -from pylibcudf.libcudf.io.orc_metadata cimport ( - binary_statistics, - bucket_statistics, - column_statistics, - date_statistics, - decimal_statistics, - double_statistics, - integer_statistics, - no_statistics, - parsed_orc_statistics, - read_parsed_orc_statistics as libcudf_read_parsed_orc_statistics, - statistics_type, - string_statistics, - timestamp_statistics, -) from pylibcudf.libcudf.io.types cimport ( column_in_metadata, compression_type, sink_info, - source_info, table_input_metadata, - table_with_metadata, ) from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.types cimport data_type, size_type, type_id -from pylibcudf.variant cimport get_if as std_get_if, holds_alternative from cudf._lib.column cimport Column -from cudf._lib.io.utils cimport ( - make_sink_info, - make_source_info, - update_column_struct_field_names, -) +from cudf._lib.io.utils cimport make_sink_info, update_col_struct_field_names +from cudf._lib.utils cimport data_from_pylibcudf_io, table_view_from_table -from cudf._lib.types import SUPPORTED_NUMPY_TO_LIBCUDF_TYPES - -from cudf._lib.types cimport underlying_type_t_type_id -from cudf._lib.utils cimport data_from_unique_ptr, table_view_from_table +import pylibcudf as plc +import cudf +from cudf._lib.types import SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES from cudf._lib.utils import _index_level_name, generate_pandas_metadata +from cudf.core.buffer import acquire_spill_lock -cdef _parse_column_type_statistics(column_statistics stats): - # Initialize stats to return and parse stats blob - column_stats = {} - - if stats.number_of_values.has_value(): - column_stats["number_of_values"] = stats.number_of_values.value() - - if stats.has_null.has_value(): - column_stats["has_null"] = stats.has_null.value() - - cdef statistics_type type_specific_stats = stats.type_specific_stats - - cdef integer_statistics* int_stats - cdef double_statistics* dbl_stats - cdef string_statistics* str_stats - cdef bucket_statistics* bucket_stats - cdef decimal_statistics* dec_stats - cdef date_statistics* date_stats - cdef binary_statistics* bin_stats - cdef timestamp_statistics* ts_stats - - if 
holds_alternative[no_statistics](type_specific_stats): - return column_stats - elif int_stats := std_get_if[integer_statistics](&type_specific_stats): - if int_stats.minimum.has_value(): - column_stats["minimum"] = int_stats.minimum.value() - else: - column_stats["minimum"] = None - if int_stats.maximum.has_value(): - column_stats["maximum"] = int_stats.maximum.value() - else: - column_stats["maximum"] = None - if int_stats.sum.has_value(): - column_stats["sum"] = int_stats.sum.value() - else: - column_stats["sum"] = None - elif dbl_stats := std_get_if[double_statistics](&type_specific_stats): - if dbl_stats.minimum.has_value(): - column_stats["minimum"] = dbl_stats.minimum.value() - else: - column_stats["minimum"] = None - if dbl_stats.maximum.has_value(): - column_stats["maximum"] = dbl_stats.maximum.value() - else: - column_stats["maximum"] = None - if dbl_stats.sum.has_value(): - column_stats["sum"] = dbl_stats.sum.value() - else: - column_stats["sum"] = None - elif str_stats := std_get_if[string_statistics](&type_specific_stats): - if str_stats.minimum.has_value(): - column_stats["minimum"] = str_stats.minimum.value().decode("utf-8") - else: - column_stats["minimum"] = None - if str_stats.maximum.has_value(): - column_stats["maximum"] = str_stats.maximum.value().decode("utf-8") - else: - column_stats["maximum"] = None - if str_stats.sum.has_value(): - column_stats["sum"] = str_stats.sum.value() - else: - column_stats["sum"] = None - elif bucket_stats := std_get_if[bucket_statistics](&type_specific_stats): - column_stats["true_count"] = bucket_stats.count[0] - column_stats["false_count"] = ( - column_stats["number_of_values"] - - column_stats["true_count"] - ) - elif dec_stats := std_get_if[decimal_statistics](&type_specific_stats): - if dec_stats.minimum.has_value(): - column_stats["minimum"] = dec_stats.minimum.value().decode("utf-8") - else: - column_stats["minimum"] = None - if dec_stats.maximum.has_value(): - column_stats["maximum"] = dec_stats.maximum.value().decode("utf-8") - else: - column_stats["maximum"] = None - if dec_stats.sum.has_value(): - column_stats["sum"] = dec_stats.sum.value().decode("utf-8") - else: - column_stats["sum"] = None - elif date_stats := std_get_if[date_statistics](&type_specific_stats): - if date_stats.minimum.has_value(): - column_stats["minimum"] = datetime.datetime.fromtimestamp( - datetime.timedelta(date_stats.minimum.value()).total_seconds(), - datetime.timezone.utc, - ) - else: - column_stats["minimum"] = None - if date_stats.maximum.has_value(): - column_stats["maximum"] = datetime.datetime.fromtimestamp( - datetime.timedelta(date_stats.maximum.value()).total_seconds(), - datetime.timezone.utc, - ) - else: - column_stats["maximum"] = None - elif bin_stats := std_get_if[binary_statistics](&type_specific_stats): - if bin_stats.sum.has_value(): - column_stats["sum"] = bin_stats.sum.value() - else: - column_stats["sum"] = None - elif ts_stats := std_get_if[timestamp_statistics](&type_specific_stats): - # Before ORC-135, the local timezone offset was included and they were - # stored as minimum and maximum. After ORC-135, the timestamp is - # adjusted to UTC before being converted to milliseconds and stored - # in minimumUtc and maximumUtc. 
- # TODO: Support minimum and maximum by reading writer's local timezone - if ts_stats.minimum_utc.has_value() and ts_stats.maximum_utc.has_value(): - column_stats["minimum"] = datetime.datetime.fromtimestamp( - ts_stats.minimum_utc.value() / 1000, datetime.timezone.utc - ) - column_stats["maximum"] = datetime.datetime.fromtimestamp( - ts_stats.maximum_utc.value() / 1000, datetime.timezone.utc - ) - else: - raise ValueError("Unsupported statistics type") - return column_stats - - +# TODO: Consider inlining this function since it seems to only be used in one place. cpdef read_parsed_orc_statistics(filepath_or_buffer): """ Cython function to call into libcudf API, see `read_parsed_orc_statistics`. @@ -201,25 +54,13 @@ cpdef read_parsed_orc_statistics(filepath_or_buffer): cudf.io.orc.read_orc_statistics """ - cdef parsed_orc_statistics parsed = ( - libcudf_read_parsed_orc_statistics(make_source_info([filepath_or_buffer])) + parsed = ( + plc.io.orc.read_parsed_orc_statistics( + plc.io.SourceInfo([filepath_or_buffer]) + ) ) - cdef vector[column_statistics] file_stats = parsed.file_stats - cdef vector[vector[column_statistics]] stripes_stats = parsed.stripes_stats - - parsed_file_stats = [ - _parse_column_type_statistics(file_stats[column_index]) - for column_index in range(file_stats.size()) - ] - - parsed_stripes_stats = [ - [_parse_column_type_statistics(stripes_stats[stripe_index][column_index]) - for column_index in range(stripes_stats[stripe_index].size())] - for stripe_index in range(stripes_stats.size()) - ] - - return parsed.column_names, parsed_file_stats, parsed_stripes_stats + return parsed.column_names, parsed.file_stats, parsed.stripes_stats cpdef read_orc(object filepaths_or_buffers, @@ -235,36 +76,34 @@ cpdef read_orc(object filepaths_or_buffers, See Also -------- cudf.read_orc + + Notes + ----- + Currently this function only considers the metadata of the first file in the list of + filepaths_or_buffers. """ - cdef orc_reader_options c_orc_reader_options = make_orc_reader_options( - filepaths_or_buffers, + + if columns is not None: + columns = [str(col) for col in columns] + + tbl_w_meta = plc.io.orc.read_orc( + plc.io.SourceInfo(filepaths_or_buffers), columns, - stripes or [], + stripes, get_skiprows_arg(skip_rows), get_num_rows_arg(num_rows), - ( - type_id.EMPTY - if timestamp_type is None else - ( - ( - SUPPORTED_NUMPY_TO_LIBCUDF_TYPES[ - cudf.dtype(timestamp_type) - ] - ) - ) - ), use_index, + plc.types.DataType( + SUPPORTED_NUMPY_TO_PYLIBCUDF_TYPES[ + cudf.dtype(timestamp_type) + ] + ) ) - cdef table_with_metadata c_result - cdef size_type nrows + names = tbl_w_meta.column_names(include_children=False) - with nogil: - c_result = move(libcudf_read_orc(c_orc_reader_options)) - - names = [info.name.decode() for info in c_result.metadata.schema_info] actual_index_names, col_names, is_range_index, reset_index_name, \ - range_idx = _get_index_from_metadata(c_result.metadata.user_data, + range_idx = _get_index_from_metadata(tbl_w_meta.per_file_user_data, names, skip_rows, num_rows) @@ -272,11 +111,11 @@ cpdef read_orc(object filepaths_or_buffers, if columns is not None and (isinstance(columns, list) and len(columns) == 0): # When `columns=[]`, index needs to be # established, but not the columns. 
- nrows = c_result.tbl.get()[0].view().num_rows() + nrows = tbl_w_meta.tbl.num_rows() return {}, cudf.RangeIndex(nrows) - data, index = data_from_unique_ptr( - move(c_result.tbl), + data, index = data_from_pylibcudf_io( + tbl_w_meta, col_names if columns is None else names, actual_index_names ) @@ -286,11 +125,13 @@ cpdef read_orc(object filepaths_or_buffers, elif reset_index_name: index.names = [None] * len(index.names) + child_name_values = tbl_w_meta.child_names.values() + data = { - name: update_column_struct_field_names( - col, c_result.metadata.schema_info[i] + name: update_col_struct_field_names( + col, child_names ) - for i, (name, col) in enumerate(data.items()) + for (name, col), child_names in zip(data.items(), child_name_values) } return data, index @@ -313,32 +154,35 @@ cdef compression_type _get_comp_type(object compression): raise ValueError(f"Unsupported `compression` type {compression}") cdef tuple _get_index_from_metadata( - map[string, string] user_data, + vector[map[string, string]] user_data, object names, object skip_rows, object num_rows): - json_str = user_data[b'pandas'].decode('utf-8') + meta = None index_col = None is_range_index = False reset_index_name = False range_idx = None - if json_str != "": - meta = json.loads(json_str) - if 'index_columns' in meta and len(meta['index_columns']) > 0: - index_col = meta['index_columns'] - if isinstance(index_col[0], dict) and \ - index_col[0]['kind'] == 'range': - is_range_index = True - else: - index_col_names = OrderedDict() - for idx_col in index_col: - for c in meta['columns']: - if c['field_name'] == idx_col: - index_col_names[idx_col] = \ - c['name'] or c['field_name'] - if c['name'] is None: - reset_index_name = True + + if user_data.size() > 0: + json_str = user_data[0][b'pandas'].decode('utf-8') + if json_str != "": + meta = json.loads(json_str) + if 'index_columns' in meta and len(meta['index_columns']) > 0: + index_col = meta['index_columns'] + if isinstance(index_col[0], dict) and \ + index_col[0]['kind'] == 'range': + is_range_index = True + else: + index_col_names = OrderedDict() + for idx_col in index_col: + for c in meta['columns']: + if c['field_name'] == idx_col: + index_col_names[idx_col] = \ + c['name'] or c['field_name'] + if c['name'] is None: + reset_index_name = True actual_index_names = None if index_col is not None and len(index_col) > 0: @@ -473,41 +317,6 @@ cdef int64_t get_num_rows_arg(object arg) except*: return arg -cdef orc_reader_options make_orc_reader_options( - object filepaths_or_buffers, - object column_names, - object stripes, - int64_t skip_rows, - int64_t num_rows, - type_id timestamp_type, - bool use_index -) except*: - - cdef vector[vector[size_type]] strps = stripes - cdef orc_reader_options opts - cdef source_info src = make_source_info(filepaths_or_buffers) - opts = move( - orc_reader_options.builder(src) - .stripes(strps) - .skip_rows(skip_rows) - .timestamp_type(data_type(timestamp_type)) - .use_index(use_index) - .build() - ) - if num_rows >= 0: - opts.set_num_rows(num_rows) - - cdef vector[string] c_column_names - if column_names is not None: - c_column_names.reserve(len(column_names)) - for col in column_names: - c_column_names.push_back(str(col).encode()) - if len(column_names) > 0: - opts.set_columns(c_column_names) - - return opts - - cdef class ORCWriter: """ ORCWriter lets you you incrementally write out a ORC file from a series diff --git a/python/cudf/cudf/_lib/partitioning.pyx b/python/cudf/cudf/_lib/partitioning.pyx index d94f0e1b564..13997da8403 100644 --- 
a/python/cudf/cudf/_lib/partitioning.pyx +++ b/python/cudf/cudf/_lib/partitioning.pyx @@ -2,24 +2,13 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair -from libcpp.utility cimport move -from libcpp.vector cimport vector - -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.partitioning cimport partition as cpp_partition -from pylibcudf.libcudf.table.table cimport table -from pylibcudf.libcudf.table.table_view cimport table_view - from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns + +import pylibcudf as plc from cudf._lib.reduce import minmax from cudf._lib.stream_compaction import distinct_count as cpp_distinct_count -cimport pylibcudf.libcudf.types as libcudf_types - @acquire_spill_lock() def partition(list source_columns, Column partition_map, @@ -50,25 +39,15 @@ def partition(list source_columns, Column partition_map, if num_partitions is None: num_partitions = cpp_distinct_count(partition_map, ignore_nulls=True) - cdef int c_num_partitions = num_partitions - cdef table_view c_source_view = table_view_from_columns(source_columns) - - cdef column_view c_partition_map_view = partition_map.view() - cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result if partition_map.size > 0: lo, hi = minmax(partition_map) if lo < 0 or hi >= num_partitions: raise ValueError("Partition map has invalid values") - with nogil: - c_result = move( - cpp_partition( - c_source_view, - c_partition_map_view, - c_num_partitions - ) - ) - return ( - columns_from_unique_ptr(move(c_result.first)), list(c_result.second) + plc_table, offsets = plc.partitioning.partition( + plc.Table([col.to_pylibcudf(mode="read") for col in source_columns]), + partition_map.to_pylibcudf(mode="read"), + num_partitions ) + return [Column.from_pylibcudf(col) for col in plc_table.columns()], offsets diff --git a/python/cudf/cudf/_lib/strings/attributes.pyx b/python/cudf/cudf/_lib/strings/attributes.pyx index fe8c17c9e31..df81b3942b4 100644 --- a/python/cudf/cudf/_lib/strings/attributes.pyx +++ b/python/cudf/cudf/_lib/strings/attributes.pyx @@ -2,19 +2,10 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.column.column_view cimport column_view -from pylibcudf.libcudf.strings.attributes cimport ( - code_points as cpp_code_points, - count_bytes as cpp_count_bytes, - count_characters as cpp_count_characters, -) - from cudf._lib.column cimport Column +import pylibcudf as plc + @acquire_spill_lock() def count_characters(Column source_strings): @@ -22,13 +13,10 @@ def count_characters(Column source_strings): Returns an integer numeric column containing the length of each string in characters. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_count_characters(source_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.attributes.count_characters( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -37,13 +25,10 @@ def count_bytes(Column source_strings): Returns an integer numeric column containing the number of bytes of each string. 
""" - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_count_bytes(source_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.attributes.count_bytes( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) @acquire_spill_lock() @@ -52,10 +37,7 @@ def code_points(Column source_strings): Creates a numeric column with code point values (integers) for each character of each string. """ - cdef unique_ptr[column] c_result - cdef column_view source_view = source_strings.view() - - with nogil: - c_result = move(cpp_code_points(source_view)) - - return Column.from_unique_ptr(move(c_result)) + plc_column = plc.strings.attributes.code_points( + source_strings.to_pylibcudf(mode="read") + ) + return Column.from_pylibcudf(plc_column) diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx index bff3b2c4ce4..54624a5a2fd 100644 --- a/python/cudf/cudf/_lib/timezone.pyx +++ b/python/cudf/cudf/_lib/timezone.pyx @@ -1,29 +1,10 @@ # Copyright (c) 2023-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.optional cimport make_optional -from libcpp.string cimport string -from libcpp.utility cimport move +import pylibcudf as plc -from pylibcudf.libcudf.io.timezone cimport ( - make_timezone_transition_table as cpp_make_timezone_transition_table, -) -from pylibcudf.libcudf.table.table cimport table - -from cudf._lib.utils cimport columns_from_unique_ptr +from cudf._lib.column cimport Column def make_timezone_transition_table(tzdir, tzname): - cdef unique_ptr[table] c_result - cdef string c_tzdir = tzdir.encode() - cdef string c_tzname = tzname.encode() - - with nogil: - c_result = move( - cpp_make_timezone_transition_table( - make_optional[string](c_tzdir), - c_tzname - ) - ) - - return columns_from_unique_ptr(move(c_result)) + plc_table = plc.io.timezone.make_timezone_transition_table(tzdir, tzname) + return [Column.from_pylibcudf(col) for col in plc_table.columns()] diff --git a/python/cudf/cudf/_lib/transpose.pyx b/python/cudf/cudf/_lib/transpose.pyx index f78fbd4c844..995d278cb88 100644 --- a/python/cudf/cudf/_lib/transpose.pyx +++ b/python/cudf/cudf/_lib/transpose.pyx @@ -1,32 +1,18 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. -from libcpp.memory cimport unique_ptr -from libcpp.pair cimport pair -from libcpp.utility cimport move - -from pylibcudf.libcudf.column.column cimport column -from pylibcudf.libcudf.table.table_view cimport table_view -from pylibcudf.libcudf.transpose cimport transpose as cpp_transpose +import pylibcudf as plc from cudf._lib.column cimport Column -from cudf._lib.utils cimport columns_from_table_view, table_view_from_columns def transpose(list source_columns): """Transpose m n-row columns into n m-row columns """ - cdef pair[unique_ptr[column], table_view] c_result - cdef table_view c_input = table_view_from_columns(source_columns) - - with nogil: - c_result = move(cpp_transpose(c_input)) - - # Notice, the data pointer of `result_owner` has been exposed - # through `c_result.second` at this point. 
- result_owner = Column.from_unique_ptr( - move(c_result.first), data_ptr_exposed=True - ) - return columns_from_table_view( - c_result.second, - owners=[result_owner] * c_result.second.num_columns() + input_table = plc.table.Table( + [col.to_pylibcudf(mode="read") for col in source_columns] ) + result_table = plc.transpose.transpose(input_table) + return [ + Column.from_pylibcudf(col, data_ptr_exposed=True) + for col in result_table.columns() + ] diff --git a/python/cudf/cudf/_lib/utils.pxd b/python/cudf/cudf/_lib/utils.pxd index ff97fe80310..7254db5c43d 100644 --- a/python/cudf/cudf/_lib/utils.pxd +++ b/python/cudf/cudf/_lib/utils.pxd @@ -11,7 +11,7 @@ from pylibcudf.libcudf.table.table cimport table, table_view cdef data_from_unique_ptr( unique_ptr[table] c_tbl, column_names, index_names=*) cdef data_from_pylibcudf_table(tbl, column_names, index_names=*) -cdef data_from_pylibcudf_io(tbl_with_meta) +cdef data_from_pylibcudf_io(tbl_with_meta, column_names = *, index_names = *) cdef data_from_table_view( table_view tv, object owner, object column_names, object index_names=*) cdef table_view table_view_from_columns(columns) except * diff --git a/python/cudf/cudf/_lib/utils.pyx b/python/cudf/cudf/_lib/utils.pyx index 8660cca9322..9e5b99f64eb 100644 --- a/python/cudf/cudf/_lib/utils.pyx +++ b/python/cudf/cudf/_lib/utils.pyx @@ -316,15 +316,17 @@ cdef data_from_pylibcudf_table(tbl, column_names, index_names=None): index_names ) -cdef data_from_pylibcudf_io(tbl_with_meta): +cdef data_from_pylibcudf_io(tbl_with_meta, column_names=None, index_names=None): """ Unpacks the TableWithMetadata from libcudf I/O into a dict of columns and an Index (cuDF format) """ + if column_names is None: + column_names = tbl_with_meta.column_names(include_children=False) return _data_from_columns( columns=[Column.from_pylibcudf(plc) for plc in tbl_with_meta.columns], - column_names=tbl_with_meta.column_names(include_children=False), - index_names=None + column_names=column_names, + index_names=index_names ) cdef columns_from_table_view( diff --git a/python/cudf/cudf/core/frame.py b/python/cudf/cudf/core/frame.py index 98af006f6e5..37ad6b8fabb 100644 --- a/python/cudf/cudf/core/frame.py +++ b/python/cudf/cudf/core/frame.py @@ -767,11 +767,15 @@ def fillna( ) @_performance_tracking - def _drop_column(self, name): - """Drop a column by *name*""" - if name not in self._data: - raise KeyError(f"column '{name}' does not exist") - del self._data[name] + def _drop_column( + self, name: abc.Hashable, errors: Literal["ignore", "raise"] = "raise" + ) -> None: + """Drop a column by *name* inplace.""" + try: + del self._data[name] + except KeyError as err: + if errors != "ignore": + raise KeyError(f"column '{name}' does not exist") from err @_performance_tracking def _quantile_table( diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 810d4ad74e7..5952815deef 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -3,7 +3,6 @@ from __future__ import annotations -import numbers import operator import textwrap import warnings @@ -150,24 +149,14 @@ ) -def _get_host_unique(array): +def _get_unique_drop_labels(array): + """Return labels to be dropped for IndexFrame.drop.""" if isinstance(array, (cudf.Series, cudf.Index, ColumnBase)): - return array.unique.to_pandas() - elif isinstance(array, (str, numbers.Number)): - return [array] + yield from np.unique(as_column(array).values_host) + elif is_scalar(array): + yield array else: - return 
set(array) - - -def _drop_columns(f: Frame, columns: abc.Iterable, errors: str): - for c in columns: - try: - f._drop_column(c) - except KeyError as e: - if errors == "ignore": - pass - else: - raise e + yield from set(array) def _indices_from_labels(obj, labels): @@ -5262,15 +5251,14 @@ def drop( out = self.copy() if axis in (1, "columns"): - target = _get_host_unique(target) - - _drop_columns(out, target, errors) + for label in _get_unique_drop_labels(target): + out._drop_column(label, errors=errors) elif axis in (0, "index"): dropped = _drop_rows_by_labels(out, target, level, errors) if columns is not None: - columns = _get_host_unique(columns) - _drop_columns(dropped, columns, errors) + for label in _get_unique_drop_labels(columns): + dropped._drop_column(label, errors=errors) out._mimic_inplace(dropped, inplace=True) diff --git a/python/cudf/cudf/io/orc.py b/python/cudf/cudf/io/orc.py index c54293badbe..68b60809bb9 100644 --- a/python/cudf/cudf/io/orc.py +++ b/python/cudf/cudf/io/orc.py @@ -181,11 +181,6 @@ def read_orc_statistics( parsed_stripes_statistics, ) = liborc.read_parsed_orc_statistics(path_or_buf) - # Parse column names - column_names = [ - column_name.decode("utf-8") for column_name in column_names - ] - # Parse file statistics file_statistics = { column_name: column_stats @@ -248,9 +243,9 @@ def _filter_stripes( num_rows_scanned = 0 for i, stripe_statistics in enumerate(stripes_statistics): num_rows_before_stripe = num_rows_scanned - num_rows_scanned += next(iter(stripe_statistics.values()))[ - "number_of_values" - ] + num_rows_scanned += next( + iter(stripe_statistics.values()) + ).number_of_values if stripes is not None and i not in stripes: continue if skip_rows is not None and num_rows_scanned <= skip_rows: diff --git a/python/cudf/cudf/tests/test_array_function.py b/python/cudf/cudf/tests/test_array_function.py index 773141ee71a..979c936a182 100644 --- a/python/cudf/cudf/tests/test_array_function.py +++ b/python/cudf/cudf/tests/test_array_function.py @@ -33,9 +33,10 @@ def __array_function__(self, *args, **kwargs): missing_arrfunc_reason = "NEP-18 support is not available in NumPy" +np.random.seed(0) + @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize("np_ar", [np.random.random(100)]) @pytest.mark.parametrize( "func", [ @@ -47,7 +48,8 @@ def __array_function__(self, *args, **kwargs): lambda x: np.linalg.norm(x), ], ) -def test_array_func_cudf_series(np_ar, func): +def test_array_func_cudf_series(func): + np_ar = np.random.random(100) cudf_ser = cudf.Series(np_ar) expect = func(np_ar) got = func(cudf_ser) @@ -58,9 +60,6 @@ def test_array_func_cudf_series(np_ar, func): @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize( - "pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))] -) @pytest.mark.parametrize( "func", [ @@ -74,7 +73,8 @@ def test_array_func_cudf_series(np_ar, func): lambda x: np.prod(x, axis=1), ], ) -def test_array_func_cudf_dataframe(pd_df, func): +def test_array_func_cudf_dataframe(func): + pd_df = pd.DataFrame(np.random.uniform(size=(100, 10))) cudf_df = cudf.from_pandas(pd_df) expect = func(pd_df) got = func(cudf_df) @@ -82,9 +82,6 @@ def test_array_func_cudf_dataframe(pd_df, func): @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize( - "pd_df", [pd.DataFrame(np.random.uniform(size=(100, 10)))] -) @pytest.mark.parametrize( "func", [ @@ -93,21 +90,22 @@ def test_array_func_cudf_dataframe(pd_df, func): lambda 
x: np.linalg.det(x), ], ) -def test_array_func_missing_cudf_dataframe(pd_df, func): +def test_array_func_missing_cudf_dataframe(func): + pd_df = pd.DataFrame(np.random.uniform(size=(100, 10))) cudf_df = cudf.from_pandas(pd_df) with pytest.raises(TypeError): func(cudf_df) @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize("np_ar", [np.random.random(100)]) @pytest.mark.parametrize( "func", [ lambda x: np.unique(x), ], ) -def test_array_func_cudf_index(np_ar, func): +def test_array_func_cudf_index(func): + np_ar = np.random.random(100) cudf_index = cudf.Index(cudf.Series(np_ar)) expect = func(np_ar) got = func(cudf_index) @@ -118,7 +116,6 @@ def test_array_func_cudf_index(np_ar, func): @pytest.mark.skipif(missing_arrfunc_cond, reason=missing_arrfunc_reason) -@pytest.mark.parametrize("np_ar", [np.random.random(100)]) @pytest.mark.parametrize( "func", [ @@ -127,7 +124,8 @@ def test_array_func_cudf_index(np_ar, func): lambda x: np.linalg.det(x), ], ) -def test_array_func_missing_cudf_index(np_ar, func): +def test_array_func_missing_cudf_index(func): + np_ar = np.random.random(100) cudf_index = cudf.Index(cudf.Series(np_ar)) with pytest.raises(TypeError): func(cudf_index) diff --git a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py index 9d69e626c3d..5acdf36de80 100644 --- a/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py +++ b/python/cudf/cudf/tests/test_avro_reader_fastavro_integration.py @@ -236,6 +236,7 @@ def test_avro_compression(rows, codec): }, ], rows, + seed=0, ) expected_df = cudf.DataFrame.from_arrow(df) @@ -599,7 +600,7 @@ def test_avro_reader_multiblock( else: assert dtype in ("float32", "float64") avro_type = "float" if dtype == "float32" else "double" - + np.random.seed(0) # We don't use rand_dataframe() here, because it increases the # execution time of each test by a factor of 10 or more (it appears # to use a very costly approach to generating random data). 
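The test updates above all follow the same pattern: random inputs are drawn inside the test body after an explicit ``np.random.seed(0)`` (or a ``seed=0`` argument to the data generator), rather than inside ``@pytest.mark.parametrize``, which is evaluated at collection time and makes the generated data hard to reproduce. A minimal sketch of that pattern, using a hypothetical test and plain NumPy/pytest rather than the actual cuDF tests:

    import numpy as np
    import pytest


    @pytest.mark.parametrize("func", [np.mean, np.sum, np.var])
    def test_func_on_random_data(func):
        # Seed first, then draw the data inside the test body so every run
        # (and every parametrized case) sees the same reproducible values,
        # and nothing random is evaluated at collection time.
        np.random.seed(0)
        data = np.random.random(100)
        assert np.isfinite(func(data))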
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index f4d1578bda7..6f88d942746 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -515,6 +515,17 @@ def test_dataframe_drop_columns(pdf, columns, inplace): assert_eq(expected, actual) +@pytest.mark.parametrize("obj", ["Index", "Series"]) +def test_drop_cudf_obj_columns(obj): + pdf = pd.DataFrame({"A": [1], "B": [1]}) + gdf = cudf.from_pandas(pdf) + + columns = ["B"] + expected = pdf.drop(labels=getattr(pd, obj)(columns), axis=1) + actual = gdf.drop(columns=getattr(cudf, obj)(columns), axis=1) + assert_eq(expected, actual) + + @pytest.mark.parametrize( "pdf", [ diff --git a/python/cudf/cudf/tests/test_groupby.py b/python/cudf/cudf/tests/test_groupby.py index 0aaa71e50d7..848bc259e7b 100644 --- a/python/cudf/cudf/tests/test_groupby.py +++ b/python/cudf/cudf/tests/test_groupby.py @@ -2470,6 +2470,7 @@ def test_groupby_2keys_rank(nelem, method, ascending, na_option, pct): ], rows=nelem, use_threads=False, + seed=0, ) pdf = t.to_pandas() pdf.columns = ["x", "y", "z"] @@ -2602,6 +2603,7 @@ def test_groupby_shift_row_mixed_numerics( ], rows=nelem, use_threads=False, + seed=0, ) pdf = t.to_pandas() gdf = cudf.from_pandas(pdf) @@ -2639,6 +2641,7 @@ def test_groupby_shift_row_mixed(nelem, shift_perc, direction): ], rows=nelem, use_threads=False, + seed=0, ) pdf = t.to_pandas() gdf = cudf.from_pandas(pdf) @@ -2687,6 +2690,7 @@ def test_groupby_shift_row_mixed_fill( ], rows=nelem, use_threads=False, + seed=0, ) pdf = t.to_pandas() gdf = cudf.from_pandas(pdf) @@ -2732,6 +2736,7 @@ def test_groupby_shift_row_zero_shift(nelem, fill_value): ], rows=nelem, use_threads=False, + seed=0, ) gdf = cudf.from_pandas(t.to_pandas()) @@ -2782,6 +2787,7 @@ def test_groupby_diff_row_mixed_numerics(nelem, shift_perc, direction): ], rows=nelem, use_threads=False, + seed=0, ) pdf = t.to_pandas() gdf = cudf.from_pandas(pdf) @@ -2815,6 +2821,7 @@ def test_groupby_diff_row_zero_shift(nelem): ], rows=nelem, use_threads=False, + seed=0, ) gdf = cudf.from_pandas(t.to_pandas()) diff --git a/python/cudf/cudf/tests/test_orc.py b/python/cudf/cudf/tests/test_orc.py index c2a30b76bea..1dd732c7191 100644 --- a/python/cudf/cudf/tests/test_orc.py +++ b/python/cudf/cudf/tests/test_orc.py @@ -184,25 +184,25 @@ def test_orc_read_statistics(datadir): pytest.skip(".orc file is not found: %s" % e) # Check numberOfValues - assert_eq(file_statistics[0]["int1"]["number_of_values"], 11_000) + assert_eq(file_statistics[0]["int1"].number_of_values, 11_000) assert_eq( - file_statistics[0]["int1"]["number_of_values"], + file_statistics[0]["int1"].number_of_values, sum( [ - stripes_statistics[0]["int1"]["number_of_values"], - stripes_statistics[1]["int1"]["number_of_values"], - stripes_statistics[2]["int1"]["number_of_values"], + stripes_statistics[0]["int1"].number_of_values, + stripes_statistics[1]["int1"].number_of_values, + stripes_statistics[2]["int1"].number_of_values, ] ), ) assert_eq( - stripes_statistics[1]["int1"]["number_of_values"], - stripes_statistics[1]["string1"]["number_of_values"], + stripes_statistics[1]["int1"].number_of_values, + stripes_statistics[1]["string1"].number_of_values, ) - assert_eq(stripes_statistics[2]["string1"]["number_of_values"], 1_000) + assert_eq(stripes_statistics[2]["string1"].number_of_values, 1_000) # Check other statistics - assert_eq(stripes_statistics[2]["string1"]["has_null"], False) + assert_eq(stripes_statistics[2]["string1"].has_null, False) assert_eq( 
file_statistics[0]["int1"]["minimum"], min( @@ -1538,8 +1538,8 @@ def test_empty_statistics(): for stats in got: # Similar expected stats for the first 6 columns in this case for col_name in ascii_lowercase[:6]: - assert stats[0][col_name].get("number_of_values") == 0 - assert stats[0][col_name].get("has_null") is True + assert stats[0][col_name].number_of_values == 0 + assert stats[0][col_name].has_null is True assert stats[0][col_name].get("minimum") is None assert stats[0][col_name].get("maximum") is None for col_name in ascii_lowercase[:3]: @@ -1547,17 +1547,17 @@ def test_empty_statistics(): # Sum for decimal column is a string assert stats[0]["d"].get("sum") == "0" - assert stats[0]["g"].get("number_of_values") == 0 - assert stats[0]["g"].get("has_null") is True + assert stats[0]["g"].number_of_values == 0 + assert stats[0]["g"].has_null is True assert stats[0]["g"].get("true_count") == 0 assert stats[0]["g"].get("false_count") == 0 - assert stats[0]["h"].get("number_of_values") == 0 - assert stats[0]["h"].get("has_null") is True + assert stats[0]["h"].number_of_values == 0 + assert stats[0]["h"].has_null is True assert stats[0]["h"].get("sum") == 0 - assert stats[0]["i"].get("number_of_values") == 1 - assert stats[0]["i"].get("has_null") is False + assert stats[0]["i"].number_of_values == 1 + assert stats[0]["i"].has_null is False assert stats[0]["i"].get("minimum") == 1 assert stats[0]["i"].get("maximum") == 1 assert stats[0]["i"].get("sum") == 1 diff --git a/python/cudf/cudf/utils/ioutils.py b/python/cudf/cudf/utils/ioutils.py index 1180da321e6..d636f36f282 100644 --- a/python/cudf/cudf/utils/ioutils.py +++ b/python/cudf/cudf/utils/ioutils.py @@ -1873,7 +1873,7 @@ def _apply_filter_bool_eq(val, col_stats): return False elif val is False: if (col_stats["false_count"] == 0) or ( - col_stats["true_count"] == col_stats["number_of_values"] + col_stats["true_count"] == col_stats.number_of_values ): return False return True @@ -1900,7 +1900,7 @@ def _apply_predicate(op, val, col_stats): return False # TODO: Replace pd.isnull with # cudf.isnull once it is implemented - if pd.isnull(val) and not col_stats["has_null"]: + if pd.isnull(val) and not col_stats.has_null: return False if not _apply_filter_bool_eq(val, col_stats): return False diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index 9347ebba5de..bead964a0ef 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -8,6 +8,7 @@ import numpy as np import pandas as pd import pyarrow as pa +from packaging.version import Version from pandas.api.types import is_scalar import dask.dataframe as dd @@ -52,6 +53,10 @@ get_parallel_type.register(cudf.BaseIndex, lambda _: Index) +# Required for Arrow filesystem support in read_parquet +PYARROW_GE_15 = Version(pa.__version__) >= Version("15.0.0") + + @meta_nonempty.register(cudf.BaseIndex) @_dask_cudf_performance_tracking def _nonempty_index(idx): @@ -695,15 +700,140 @@ def from_dict( ) @staticmethod - def read_parquet(*args, engine=None, **kwargs): + def read_parquet(path, *args, filesystem="fsspec", engine=None, **kwargs): import dask_expr as dx + import fsspec - from dask_cudf.io.parquet import CudfEngine + if ( + isinstance(filesystem, fsspec.AbstractFileSystem) + or isinstance(filesystem, str) + and filesystem.lower() == "fsspec" + ): + # Default "fsspec" filesystem + from dask_cudf.io.parquet import CudfEngine - _raise_unsupported_parquet_kwargs(**kwargs) - return _default_backend( - dx.read_parquet, 
*args, engine=CudfEngine, **kwargs - ) + _raise_unsupported_parquet_kwargs(**kwargs) + return _default_backend( + dx.read_parquet, + path, + *args, + filesystem=filesystem, + engine=CudfEngine, + **kwargs, + ) + + else: + # EXPERIMENTAL filesystem="arrow" support. + # This code path uses PyArrow for IO, which is only + # beneficial for remote storage (e.g. S3) + + from fsspec.utils import stringify_path + from pyarrow import fs as pa_fs + + # CudfReadParquetPyarrowFS requires import of distributed beforehand + # (See: https://github.com/dask/dask/issues/11352) + import distributed # noqa: F401 + from dask.core import flatten + from dask.dataframe.utils import pyarrow_strings_enabled + + from dask_cudf.expr._expr import CudfReadParquetPyarrowFS + + if args: + raise ValueError(f"Unexpected positional arguments: {args}") + + if not ( + isinstance(filesystem, pa_fs.FileSystem) + or isinstance(filesystem, str) + and filesystem.lower() in ("arrow", "pyarrow") + ): + raise ValueError(f"Unexpected filesystem value: {filesystem}.") + + if not PYARROW_GE_15: + raise NotImplementedError( + "Experimental Arrow filesystem support requires pyarrow>=15" + ) + + if not isinstance(path, str): + path = stringify_path(path) + + # Extract kwargs + columns = kwargs.pop("columns", None) + filters = kwargs.pop("filters", None) + categories = kwargs.pop("categories", None) + index = kwargs.pop("index", None) + storage_options = kwargs.pop("storage_options", None) + dtype_backend = kwargs.pop("dtype_backend", None) + calculate_divisions = kwargs.pop("calculate_divisions", False) + ignore_metadata_file = kwargs.pop("ignore_metadata_file", False) + metadata_task_size = kwargs.pop("metadata_task_size", None) + split_row_groups = kwargs.pop("split_row_groups", "infer") + blocksize = kwargs.pop("blocksize", "default") + aggregate_files = kwargs.pop("aggregate_files", None) + parquet_file_extension = kwargs.pop( + "parquet_file_extension", (".parq", ".parquet", ".pq") + ) + arrow_to_pandas = kwargs.pop("arrow_to_pandas", None) + open_file_options = kwargs.pop("open_file_options", None) + + # Validate and normalize kwargs + kwargs["dtype_backend"] = dtype_backend + if arrow_to_pandas is not None: + raise ValueError( + "arrow_to_pandas not supported for the 'cudf' backend." + ) + if open_file_options is not None: + raise ValueError( + "The open_file_options argument is no longer supported " + "by the 'cudf' backend." + ) + if filters is not None: + for filter in flatten(filters, container=list): + _, op, val = filter + if op == "in" and not isinstance(val, (set, list, tuple)): + raise TypeError( + "Value of 'in' filter must be a list, set or tuple." + ) + if metadata_task_size is not None: + raise NotImplementedError( + "metadata_task_size is not supported when using the pyarrow filesystem." + ) + if split_row_groups != "infer": + raise NotImplementedError( + "split_row_groups is not supported when using the pyarrow filesystem." + ) + if parquet_file_extension != (".parq", ".parquet", ".pq"): + raise NotImplementedError( + "parquet_file_extension is not supported when using the pyarrow filesystem." + ) + if blocksize is not None and blocksize != "default": + warnings.warn( + "blocksize is not supported when using the pyarrow filesystem. " + "blocksize argument will be ignored." + ) + if aggregate_files is not None: + warnings.warn( + "aggregate_files is not supported when using the pyarrow filesystem. " + "Please use the 'dataframe.parquet.minimum-partition-size' config. " + "aggregate_files argument will be ignored." 
+ ) + + return dx.new_collection( + CudfReadParquetPyarrowFS( + path, + columns=dx._util._convert_to_list(columns), + filters=filters, + categories=categories, + index=index, + calculate_divisions=calculate_divisions, + storage_options=storage_options, + filesystem=filesystem, + ignore_metadata_file=ignore_metadata_file, + arrow_to_pandas=arrow_to_pandas, + pyarrow_strings_enabled=pyarrow_strings_enabled(), + kwargs=kwargs, + _series=isinstance(columns, str), + ) + ) @staticmethod def read_csv( diff --git a/python/dask_cudf/dask_cudf/expr/_expr.py b/python/dask_cudf/dask_cudf/expr/_expr.py index b284ab3774d..af83a01da98 100644 --- a/python/dask_cudf/dask_cudf/expr/_expr.py +++ b/python/dask_cudf/dask_cudf/expr/_expr.py @@ -2,10 +2,13 @@ import functools import dask_expr._shuffle as _shuffle_module +import pandas as pd from dask_expr import new_collection from dask_expr._cumulative import CumulativeBlockwise from dask_expr._expr import Elemwise, Expr, RenameAxis, VarColumns from dask_expr._reductions import Reduction, Var +from dask_expr.io.io import FusedParquetIO +from dask_expr.io.parquet import ReadParquetPyarrowFS from dask.dataframe.core import is_dataframe_like, make_meta, meta_nonempty from dask.dataframe.dispatch import is_categorical_dtype @@ -18,6 +21,92 @@ ## +class CudfFusedParquetIO(FusedParquetIO): + @staticmethod + def _load_multiple_files( + frag_filters, + columns, + schema, + *to_pandas_args, + ): + import pyarrow as pa + + from dask.base import apply, tokenize + from dask.threaded import get + + token = tokenize(frag_filters, columns, schema) + name = f"pq-file-{token}" + dsk = { + (name, i): ( + CudfReadParquetPyarrowFS._fragment_to_table, + frag, + filter, + columns, + schema, + ) + for i, (frag, filter) in enumerate(frag_filters) + } + dsk[name] = ( + apply, + pa.concat_tables, + [list(dsk.keys())], + {"promote_options": "permissive"}, + ) + return CudfReadParquetPyarrowFS._table_to_pandas( + get(dsk, name), + *to_pandas_args, + ) + + +class CudfReadParquetPyarrowFS(ReadParquetPyarrowFS): + @functools.cached_property + def _dataset_info(self): + from dask_cudf.io.parquet import set_object_dtypes_from_pa_schema + + dataset_info = super()._dataset_info + meta_pd = dataset_info["base_meta"] + if isinstance(meta_pd, cudf.DataFrame): + return dataset_info + + # Convert to cudf + # (drop unsupported timezone information) + for k, v in meta_pd.dtypes.items(): + if isinstance(v, pd.DatetimeTZDtype) and v.tz is not None: + meta_pd[k] = meta_pd[k].dt.tz_localize(None) + meta_cudf = cudf.from_pandas(meta_pd) + + # Re-set "object" dtypes to align with pa schema + kwargs = dataset_info.get("kwargs", {}) + set_object_dtypes_from_pa_schema( + meta_cudf, + kwargs.get("schema", None), + ) + + dataset_info["base_meta"] = meta_cudf + self.operands[type(self)._parameters.index("_dataset_info_cache")] = ( + dataset_info + ) + return dataset_info + + @staticmethod + def _table_to_pandas( + table, + index_name, + *args, + ): + df = cudf.DataFrame.from_arrow(table) + if index_name is not None: + df = df.set_index(index_name) + return df + + def _tune_up(self, parent): + if self._fusion_compression_factor >= 1: + return + if isinstance(parent, CudfFusedParquetIO): + return + return parent.substitute(self, CudfFusedParquetIO(self)) + + class RenameAxisCudf(RenameAxis): # TODO: Remove this after rename_axis is supported in cudf # (See: https://github.com/rapidsai/cudf/issues/16895) diff --git a/python/dask_cudf/dask_cudf/io/tests/test_s3.py b/python/dask_cudf/dask_cudf/io/tests/test_s3.py index 
a14ffbc37dc..cf8af82e112 100644 --- a/python/dask_cudf/dask_cudf/io/tests/test_s3.py +++ b/python/dask_cudf/dask_cudf/io/tests/test_s3.py @@ -12,6 +12,7 @@ from dask.dataframe import assert_eq import dask_cudf +from dask_cudf.tests.utils import QUERY_PLANNING_ON moto = pytest.importorskip("moto", minversion="3.1.6") boto3 = pytest.importorskip("boto3") @@ -127,7 +128,20 @@ def test_read_parquet_open_file_options_raises(): ) -def test_read_parquet_filesystem(s3_base, s3so, pdf): +@pytest.mark.parametrize( + "filesystem", + [ + pytest.param( + "arrow", + marks=pytest.mark.skipif( + not QUERY_PLANNING_ON or not dask_cudf.backends.PYARROW_GE_15, + reason="Not supported", + ), + ), + "fsspec", + ], +) +def test_read_parquet_filesystem(s3_base, s3so, pdf, filesystem): fname = "test_parquet_filesystem.parquet" bucket = "parquet" buffer = BytesIO() @@ -135,21 +149,24 @@ def test_read_parquet_filesystem(s3_base, s3so, pdf): buffer.seek(0) with s3_context(s3_base=s3_base, bucket=bucket, files={fname: buffer}): path = f"s3://{bucket}/{fname}" + if filesystem == "arrow": + # This feature requires arrow >= 15 + pytest.importorskip("pyarrow", minversion="15.0.0") - # Cannot pass filesystem="arrow" - with pytest.raises(ValueError): - dask_cudf.read_parquet( + import pyarrow.fs as pa_fs + + df = dask_cudf.read_parquet( + path, + filesystem=pa_fs.S3FileSystem( + endpoint_override=s3so["client_kwargs"]["endpoint_url"], + ), + ) + else: + df = dask_cudf.read_parquet( path, storage_options=s3so, - filesystem="arrow", + filesystem=filesystem, ) - - # Can pass filesystem="fsspec" - df = dask_cudf.read_parquet( - path, - storage_options=s3so, - filesystem="fsspec", - ) assert df.b.sum().compute() == 9 diff --git a/python/dask_cudf/dask_cudf/tests/test_reductions.py b/python/dask_cudf/dask_cudf/tests/test_reductions.py index 88b15718382..d03e92319be 100644 --- a/python/dask_cudf/dask_cudf/tests/test_reductions.py +++ b/python/dask_cudf/dask_cudf/tests/test_reductions.py @@ -13,6 +13,7 @@ def _make_random_frame(nelem, npartitions=2): + np.random.seed(0) df = pd.DataFrame( { "x": np.random.randint(0, 5, size=nelem), @@ -38,7 +39,6 @@ def wrapped(series): @pytest.mark.parametrize("reducer", _reducers) def test_series_reduce(reducer): reducer = _get_reduce_fn(reducer) - np.random.seed(0) size = 10 df, gdf = _make_random_frame(size) diff --git a/python/pylibcudf/pylibcudf/CMakeLists.txt b/python/pylibcudf/pylibcudf/CMakeLists.txt index f07c8897e34..a7cb66d7b16 100644 --- a/python/pylibcudf/pylibcudf/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/CMakeLists.txt @@ -31,6 +31,7 @@ set(cython_sources lists.pyx merge.pyx null_mask.pyx + partitioning.pyx quantiles.pyx reduce.pyx replace.pyx @@ -44,6 +45,7 @@ set(cython_sources table.pyx traits.pyx transform.pyx + transpose.pyx types.pyx unary.pyx utils.pyx diff --git a/python/pylibcudf/pylibcudf/__init__.pxd b/python/pylibcudf/pylibcudf/__init__.pxd index b7cf6413c05..a384edd456d 100644 --- a/python/pylibcudf/pylibcudf/__init__.pxd +++ b/python/pylibcudf/pylibcudf/__init__.pxd @@ -17,6 +17,7 @@ from . cimport ( lists, merge, null_mask, + partitioning, quantiles, reduce, replace, @@ -29,6 +30,7 @@ from . 
cimport ( strings, traits, transform, + transpose, types, unary, ) @@ -60,6 +62,7 @@ __all__ = [ "lists", "merge", "null_mask", + "partitioning", "quantiles", "reduce", "replace", @@ -72,6 +75,7 @@ __all__ = [ "sorting", "traits", "transform", + "transpose", "types", "unary", ] diff --git a/python/pylibcudf/pylibcudf/__init__.py b/python/pylibcudf/pylibcudf/__init__.py index 84b1c29f791..2a5365e8fad 100644 --- a/python/pylibcudf/pylibcudf/__init__.py +++ b/python/pylibcudf/pylibcudf/__init__.py @@ -28,6 +28,7 @@ lists, merge, null_mask, + partitioning, quantiles, reduce, replace, @@ -40,6 +41,7 @@ strings, traits, transform, + transpose, types, unary, ) @@ -74,6 +76,7 @@ "lists", "merge", "null_mask", + "partitioning", "quantiles", "reduce", "replace", @@ -86,6 +89,7 @@ "sorting", "traits", "transform", + "transpose", "types", "unary", ] diff --git a/python/pylibcudf/pylibcudf/binaryop.pyx b/python/pylibcudf/pylibcudf/binaryop.pyx index 5a67f4d6cdb..5f9d145139a 100644 --- a/python/pylibcudf/pylibcudf/binaryop.pyx +++ b/python/pylibcudf/pylibcudf/binaryop.pyx @@ -94,7 +94,7 @@ cpdef bool is_supported_operation( ): """Check if an operation is supported for the given data types. - For details, see :cpp:func::is_supported_operation`. + For details, see :cpp:func::`is_supported_operation`. Parameters ---------- diff --git a/python/pylibcudf/pylibcudf/column_factories.pyx b/python/pylibcudf/pylibcudf/column_factories.pyx index 4601cba515a..e9085e3ea02 100644 --- a/python/pylibcudf/pylibcudf/column_factories.pyx +++ b/python/pylibcudf/pylibcudf/column_factories.pyx @@ -18,6 +18,20 @@ from .types import MaskState, TypeId cpdef Column make_empty_column(MakeEmptyColumnOperand type_or_id): + """Creates an empty column of the specified type. + + For details, see :cpp:func::`make_empty_column`. + + Parameters + ---------- + type_or_id : Union[DataType, type_id, object] + The column data type. + + Returns + ------- + Column + An empty Column + """ cdef unique_ptr[column] result cdef type_id id @@ -60,7 +74,11 @@ cpdef Column make_numeric_column( size_type size, MaskArg mstate ): + """Creates an empty numeric column. + + For details, see :cpp:func::`make_numeric_column`. + """ cdef unique_ptr[column] result cdef mask_state state diff --git a/python/pylibcudf/pylibcudf/groupby.pyx b/python/pylibcudf/pylibcudf/groupby.pyx index ae5d33aaa46..afb95dba5b3 100644 --- a/python/pylibcudf/pylibcudf/groupby.pyx +++ b/python/pylibcudf/pylibcudf/groupby.pyx @@ -286,7 +286,7 @@ cdef class GroupBy: Returns ------- - Tuple[List[int], Table, Table]] + Tuple[List[int], Table, Table] A tuple of tables containing three items: - A list of integer offsets into the group keys/values - A table of group keys diff --git a/python/pylibcudf/pylibcudf/io/CMakeLists.txt b/python/pylibcudf/pylibcudf/io/CMakeLists.txt index bcc2151f5b6..965724a47b1 100644 --- a/python/pylibcudf/pylibcudf/io/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/io/CMakeLists.txt @@ -12,7 +12,9 @@ # the License. 
# ============================================================================= -set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx parquet.pyx types.pyx) +set(cython_sources avro.pyx csv.pyx datasource.pyx json.pyx orc.pyx parquet.pyx timezone.pyx + types.pyx +) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/pylibcudf/pylibcudf/io/__init__.pxd b/python/pylibcudf/pylibcudf/io/__init__.pxd index 62820048584..1bcc0a3f963 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.pxd +++ b/python/pylibcudf/pylibcudf/io/__init__.pxd @@ -1,5 +1,5 @@ # Copyright (c) 2024, NVIDIA CORPORATION. # CSV is removed since it is def not cpdef (to force kw-only arguments) -from . cimport avro, datasource, json, parquet, types +from . cimport avro, datasource, json, orc, parquet, timezone, types from .types cimport SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/__init__.py b/python/pylibcudf/pylibcudf/io/__init__.py index 27640f7d955..2e4f215b12c 100644 --- a/python/pylibcudf/pylibcudf/io/__init__.py +++ b/python/pylibcudf/pylibcudf/io/__init__.py @@ -1,4 +1,4 @@ # Copyright (c) 2024, NVIDIA CORPORATION. -from . import avro, csv, datasource, json, parquet, types +from . import avro, csv, datasource, json, orc, parquet, timezone, types from .types import SinkInfo, SourceInfo, TableWithMetadata diff --git a/python/pylibcudf/pylibcudf/io/avro.pyx b/python/pylibcudf/pylibcudf/io/avro.pyx index 667c67f4c36..438b0ff1634 100644 --- a/python/pylibcudf/pylibcudf/io/avro.pyx +++ b/python/pylibcudf/pylibcudf/io/avro.pyx @@ -20,6 +20,8 @@ cpdef TableWithMetadata read_avro( """ Reads an Avro dataset into a :py:class:`~.types.TableWithMetadata`. + For details, see :cpp:func:`read_avro`. + Parameters ---------- source_info: SourceInfo diff --git a/python/pylibcudf/pylibcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/io/orc.pxd new file mode 100644 index 00000000000..b111d617b1b --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/orc.pxd @@ -0,0 +1,50 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
+from libc.stdint cimport uint64_t +from libcpp cimport bool +from libcpp.optional cimport optional +from libcpp.string cimport string +from libcpp.vector cimport vector +from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.libcudf.io.orc_metadata cimport ( + column_statistics, + parsed_orc_statistics, + statistics_type, +) +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.types cimport DataType + + +cpdef TableWithMetadata read_orc( + SourceInfo source_info, + list columns = *, + list stripes = *, + size_type skip_rows = *, + size_type nrows = *, + bool use_index = *, + bool use_np_dtypes = *, + DataType timestamp_type = *, + list decimal128_columns = * +) + +cdef class OrcColumnStatistics: + cdef optional[uint64_t] number_of_values_c + cdef optional[bool] has_null_c + cdef statistics_type type_specific_stats_c + cdef dict column_stats + + cdef void _init_stats_dict(self) + + @staticmethod + cdef OrcColumnStatistics from_libcudf(column_statistics& col_stats) + + +cdef class ParsedOrcStatistics: + cdef parsed_orc_statistics c_obj + + @staticmethod + cdef ParsedOrcStatistics from_libcudf(parsed_orc_statistics& orc_stats) + + +cpdef ParsedOrcStatistics read_parsed_orc_statistics( + SourceInfo source_info +) diff --git a/python/pylibcudf/pylibcudf/io/orc.pyx b/python/pylibcudf/pylibcudf/io/orc.pyx new file mode 100644 index 00000000000..01a5e4b04a1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/orc.pyx @@ -0,0 +1,302 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp cimport bool +from libcpp.string cimport string +from libcpp.utility cimport move +from libcpp.vector cimport vector + +import datetime + +from pylibcudf.io.types cimport SourceInfo, TableWithMetadata +from pylibcudf.libcudf.io.orc cimport ( + orc_reader_options, + read_orc as cpp_read_orc, +) +from pylibcudf.libcudf.io.orc_metadata cimport ( + binary_statistics, + bucket_statistics, + column_statistics, + date_statistics, + decimal_statistics, + double_statistics, + integer_statistics, + no_statistics, + read_parsed_orc_statistics as cpp_read_parsed_orc_statistics, + statistics_type, + string_statistics, + timestamp_statistics, +) +from pylibcudf.libcudf.io.types cimport table_with_metadata +from pylibcudf.libcudf.types cimport size_type +from pylibcudf.types cimport DataType +from pylibcudf.variant cimport get_if, holds_alternative + + +cdef class OrcColumnStatistics: + def __init__(self): + raise TypeError( + "OrcColumnStatistics should not be instantiated by users. If it is " + "being constructed in Cython from a preexisting libcudf object, " + "use `OrcColumnStatistics.from_libcudf` instead." 
+ ) + + @property + def number_of_values(self): + if self.number_of_values_c.has_value(): + return self.number_of_values_c.value() + return None + + @property + def has_null(self): + if self.has_null_c.has_value(): + return self.has_null_c.value() + return None + + cdef void _init_stats_dict(self): + # Initialize stats to return and parse stats blob + self.column_stats = {} + + cdef statistics_type type_specific_stats = self.type_specific_stats_c + + cdef integer_statistics* int_stats + cdef double_statistics* dbl_stats + cdef string_statistics* str_stats + cdef bucket_statistics* bucket_stats + cdef decimal_statistics* dec_stats + cdef date_statistics* date_stats + cdef binary_statistics* bin_stats + cdef timestamp_statistics* ts_stats + + if holds_alternative[no_statistics](type_specific_stats): + pass + elif int_stats := get_if[integer_statistics](&type_specific_stats): + if int_stats.minimum.has_value(): + self.column_stats["minimum"] = int_stats.minimum.value() + else: + self.column_stats["minimum"] = None + if int_stats.maximum.has_value(): + self.column_stats["maximum"] = int_stats.maximum.value() + else: + self.column_stats["maximum"] = None + if int_stats.sum.has_value(): + self.column_stats["sum"] = int_stats.sum.value() + else: + self.column_stats["sum"] = None + elif dbl_stats := get_if[double_statistics](&type_specific_stats): + if dbl_stats.minimum.has_value(): + self.column_stats["minimum"] = dbl_stats.minimum.value() + else: + self.column_stats["minimum"] = None + if dbl_stats.maximum.has_value(): + self.column_stats["maximum"] = dbl_stats.maximum.value() + else: + self.column_stats["maximum"] = None + if dbl_stats.sum.has_value(): + self.column_stats["sum"] = dbl_stats.sum.value() + else: + self.column_stats["sum"] = None + elif str_stats := get_if[string_statistics](&type_specific_stats): + if str_stats.minimum.has_value(): + self.column_stats["minimum"] = str_stats.minimum.value().decode("utf-8") + else: + self.column_stats["minimum"] = None + if str_stats.maximum.has_value(): + self.column_stats["maximum"] = str_stats.maximum.value().decode("utf-8") + else: + self.column_stats["maximum"] = None + if str_stats.sum.has_value(): + self.column_stats["sum"] = str_stats.sum.value() + else: + self.column_stats["sum"] = None + elif bucket_stats := get_if[bucket_statistics](&type_specific_stats): + self.column_stats["true_count"] = bucket_stats.count[0] + self.column_stats["false_count"] = ( + self.number_of_values + - self.column_stats["true_count"] + ) + elif dec_stats := get_if[decimal_statistics](&type_specific_stats): + if dec_stats.minimum.has_value(): + self.column_stats["minimum"] = dec_stats.minimum.value().decode("utf-8") + else: + self.column_stats["minimum"] = None + if dec_stats.maximum.has_value(): + self.column_stats["maximum"] = dec_stats.maximum.value().decode("utf-8") + else: + self.column_stats["maximum"] = None + if dec_stats.sum.has_value(): + self.column_stats["sum"] = dec_stats.sum.value().decode("utf-8") + else: + self.column_stats["sum"] = None + elif date_stats := get_if[date_statistics](&type_specific_stats): + if date_stats.minimum.has_value(): + self.column_stats["minimum"] = datetime.datetime.fromtimestamp( + datetime.timedelta(date_stats.minimum.value()).total_seconds(), + datetime.timezone.utc, + ) + else: + self.column_stats["minimum"] = None + if date_stats.maximum.has_value(): + self.column_stats["maximum"] = datetime.datetime.fromtimestamp( + datetime.timedelta(date_stats.maximum.value()).total_seconds(), + datetime.timezone.utc, + ) + else: + 
self.column_stats["maximum"] = None + elif bin_stats := get_if[binary_statistics](&type_specific_stats): + if bin_stats.sum.has_value(): + self.column_stats["sum"] = bin_stats.sum.value() + else: + self.column_stats["sum"] = None + elif ts_stats := get_if[timestamp_statistics](&type_specific_stats): + # Before ORC-135, the local timezone offset was included and they were + # stored as minimum and maximum. After ORC-135, the timestamp is + # adjusted to UTC before being converted to milliseconds and stored + # in minimumUtc and maximumUtc. + # TODO: Support minimum and maximum by reading writer's local timezone + if ts_stats.minimum_utc.has_value() and ts_stats.maximum_utc.has_value(): + self.column_stats["minimum"] = datetime.datetime.fromtimestamp( + ts_stats.minimum_utc.value() / 1000, datetime.timezone.utc + ) + self.column_stats["maximum"] = datetime.datetime.fromtimestamp( + ts_stats.maximum_utc.value() / 1000, datetime.timezone.utc + ) + else: + raise ValueError("Unsupported statistics type") + + def __getitem__(self, item): + return self.column_stats[item] + + def __contains__(self, item): + return item in self.column_stats + + def get(self, item, default=None): + return self.column_stats.get(item, default) + + @staticmethod + cdef OrcColumnStatistics from_libcudf(column_statistics& col_stats): + cdef OrcColumnStatistics out = OrcColumnStatistics.__new__(OrcColumnStatistics) + out.number_of_values_c = col_stats.number_of_values + out.has_null_c = col_stats.has_null + out.type_specific_stats_c = col_stats.type_specific_stats + out._init_stats_dict() + return out + + +cdef class ParsedOrcStatistics: + + @property + def column_names(self): + return [name.decode() for name in self.c_obj.column_names] + + @property + def file_stats(self): + return [ + OrcColumnStatistics.from_libcudf(self.c_obj.file_stats[i]) + for i in range(self.c_obj.file_stats.size()) + ] + + @property + def stripes_stats(self): + return [ + [ + OrcColumnStatistics.from_libcudf(stripe_stats_c[i]) + for i in range(stripe_stats_c.size()) + ] + for stripe_stats_c in self.c_obj.stripes_stats + ] + + @staticmethod + cdef ParsedOrcStatistics from_libcudf(parsed_orc_statistics& orc_stats): + cdef ParsedOrcStatistics out = ParsedOrcStatistics.__new__(ParsedOrcStatistics) + out.c_obj = move(orc_stats) + return out + + +cpdef TableWithMetadata read_orc( + SourceInfo source_info, + list columns = None, + list stripes = None, + size_type skip_rows = 0, + size_type nrows = -1, + bool use_index = True, + bool use_np_dtypes = True, + DataType timestamp_type = None, + list decimal128_columns = None, +): + """Reads an ORC file into a :py:class:`~.types.TableWithMetadata`. + + Parameters + ---------- + source_info : SourceInfo + The SourceInfo object to read the Parquet file from. + columns : list, default None + The string names of the columns to be read. + stripes : list[list[size_type]], default None + List of stripes to be read. + skip_rows : int64_t, default 0 + The number of rows to skip from the start of the file. + nrows : size_type, default -1 + The number of rows to read. By default, read the entire file. + use_index : bool, default True + Whether to use the row index to speed up reading. + use_np_dtypes : bool, default True + Whether to use numpy compatible dtypes. + timestamp_type : DataType, default None + The timestamp type to use for the timestamp columns. + decimal128_columns : list, default None + List of column names to be read as 128-bit decimals. 
+ + Returns + ------- + TableWithMetadata + The Table and its corresponding metadata (column names) that were read in. + """ + cdef orc_reader_options opts + cdef vector[vector[size_type]] c_stripes + opts = move( + orc_reader_options.builder(source_info.c_obj) + .use_index(use_index) + .build() + ) + if nrows >= 0: + opts.set_num_rows(nrows) + if skip_rows >= 0: + opts.set_skip_rows(skip_rows) + if stripes is not None: + c_stripes = stripes + opts.set_stripes(c_stripes) + if timestamp_type is not None: + opts.set_timestamp_type(timestamp_type.c_obj) + + cdef vector[string] c_decimal128_columns + if decimal128_columns is not None and len(decimal128_columns) > 0: + c_decimal128_columns.reserve(len(decimal128_columns)) + for col in decimal128_columns: + if not isinstance(col, str): + raise TypeError("Decimal 128 column names must be strings!") + c_decimal128_columns.push_back(col.encode()) + opts.set_decimal128_columns(c_decimal128_columns) + + cdef vector[string] c_column_names + if columns is not None and len(columns) > 0: + c_column_names.reserve(len(columns)) + for col in columns: + if not isinstance(col, str): + raise TypeError("Column names must be strings!") + c_column_names.push_back(col.encode()) + opts.set_columns(c_column_names) + + cdef table_with_metadata c_result + + with nogil: + c_result = move(cpp_read_orc(opts)) + + return TableWithMetadata.from_libcudf(c_result) + + +cpdef ParsedOrcStatistics read_parsed_orc_statistics( + SourceInfo source_info +): + cdef parsed_orc_statistics parsed = ( + cpp_read_parsed_orc_statistics(source_info.c_obj) + ) + return ParsedOrcStatistics.from_libcudf(parsed) diff --git a/python/pylibcudf/pylibcudf/io/parquet.pyx b/python/pylibcudf/pylibcudf/io/parquet.pyx index df1f1b14247..981ca7b8159 100644 --- a/python/pylibcudf/pylibcudf/io/parquet.pyx +++ b/python/pylibcudf/pylibcudf/io/parquet.pyx @@ -59,6 +59,8 @@ cdef class ChunkedParquetReader: """ Reads chunks of a Parquet file into a :py:class:`~.types.TableWithMetadata`. + For details, see :cpp:class:`chunked_parquet_reader`. + Parameters ---------- source_info : SourceInfo @@ -167,6 +169,8 @@ cpdef read_parquet( ): """Reads an Parquet file into a :py:class:`~.types.TableWithMetadata`. + For details, see :cpp:func:`read_parquet`. + Parameters ---------- source_info : SourceInfo diff --git a/python/pylibcudf/pylibcudf/io/timezone.pxd b/python/pylibcudf/pylibcudf/io/timezone.pxd new file mode 100644 index 00000000000..2aa755dbbd8 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/timezone.pxd @@ -0,0 +1,6 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from ..table cimport Table + + +cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name) diff --git a/python/pylibcudf/pylibcudf/io/timezone.pyx b/python/pylibcudf/pylibcudf/io/timezone.pyx new file mode 100644 index 00000000000..e02239d7252 --- /dev/null +++ b/python/pylibcudf/pylibcudf/io/timezone.pyx @@ -0,0 +1,43 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.optional cimport make_optional +from libcpp.string cimport string +from libcpp.utility cimport move +from pylibcudf.libcudf.io.timezone cimport ( + make_timezone_transition_table as cpp_make_timezone_transition_table, +) +from pylibcudf.libcudf.table.table cimport table + +from ..table cimport Table + + +cpdef Table make_timezone_transition_table(str tzif_dir, str timezone_name): + """ + Creates a transition table to convert ORC timestamps to UTC. 
+ + Parameters + ---------- + tzif_dir : str + The directory where the TZif files are located + timezone_name : str + standard timezone name + + Returns + ------- + Table + The transition table for the given timezone. + """ + cdef unique_ptr[table] c_result + cdef string c_tzdir = tzif_dir.encode() + cdef string c_tzname = timezone_name.encode() + + with nogil: + c_result = move( + cpp_make_timezone_transition_table( + make_optional[string](c_tzdir), + c_tzname + ) + ) + + return Table.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/io/types.pyx b/python/pylibcudf/pylibcudf/io/types.pyx index 1600a805b37..563a02761da 100644 --- a/python/pylibcudf/pylibcudf/io/types.pyx +++ b/python/pylibcudf/pylibcudf/io/types.pyx @@ -130,6 +130,7 @@ cdef class TableWithMetadata: """ return self.metadata.per_file_user_data + cdef class SourceInfo: """A class containing details on a source to read from. diff --git a/python/pylibcudf/pylibcudf/labeling.pyx b/python/pylibcudf/pylibcudf/labeling.pyx index b5a7445df36..b3f6a92d85c 100644 --- a/python/pylibcudf/pylibcudf/labeling.pyx +++ b/python/pylibcudf/pylibcudf/labeling.pyx @@ -20,6 +20,8 @@ cpdef Column label_bins( ): """Labels elements based on membership in the specified bins. + For details see :cpp:func:`label_bins`. + Parameters ---------- input : Column diff --git a/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd b/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd index e4a09b8feb2..dca24c7f665 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/orc.pxd @@ -35,6 +35,7 @@ cdef extern from "cudf/io/orc.hpp" \ void enable_use_index(bool val) except + void enable_use_np_dtypes(bool val) except + void set_timestamp_type(data_type type) except + + void set_decimal128_columns(vector[string] val) except + @staticmethod orc_reader_options_builder builder( diff --git a/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd b/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd index db6cb0cdfa5..9302ffe2f80 100644 --- a/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/io/orc_metadata.pxd @@ -1,11 +1,11 @@ # Copyright (c) 2020-2024, NVIDIA CORPORATION. 
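A minimal usage sketch (not part of the diff) for the ORC reader and statistics bindings added above. The module paths (plc.io.orc, plc.io.SourceInfo) follow this PR; the file name "example.orc" and the column names are placeholders.

    import pylibcudf as plc

    src = plc.io.SourceInfo(["example.orc"])

    # Read the first 100 rows of two columns into a TableWithMetadata.
    tbl_with_meta = plc.io.orc.read_orc(src, columns=["a", "b"], nrows=100)

    # Parse the file-level statistics stored in the ORC footer.
    stats = plc.io.orc.read_parsed_orc_statistics(src)
    print(stats.column_names)
    print([col_stats.get("minimum") for col_stats in stats.file_stats])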
-cimport pylibcudf.libcudf.io.types as cudf_io_types from libc.stdint cimport int32_t, int64_t, uint32_t, uint64_t from libcpp cimport bool from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector +from pylibcudf.libcudf.io cimport types as cudf_io_types from pylibcudf.variant cimport monostate, variant diff --git a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd index 1ea10e8a194..89bddbffab5 100644 --- a/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd +++ b/python/pylibcudf/pylibcudf/libcudf/partitioning.pxd @@ -25,3 +25,10 @@ cdef extern from "cudf/partitioning.hpp" namespace "cudf" nogil: const column_view& partition_map, int num_partitions ) except + + + cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] \ + round_robin_partition "cudf::round_robin_partition" ( + const table_view& input, + int num_partitions, + int start_partition + ) except + diff --git a/python/pylibcudf/pylibcudf/lists.pyx b/python/pylibcudf/pylibcudf/lists.pyx index 947caddc485..6f82124d06e 100644 --- a/python/pylibcudf/pylibcudf/lists.pyx +++ b/python/pylibcudf/pylibcudf/lists.pyx @@ -52,6 +52,8 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx): All other columns will be duplicated for each element in the list. + For details, see :cpp:func:`explode_outer`. + Parameters ---------- input : Table @@ -75,6 +77,8 @@ cpdef Table explode_outer(Table input, size_type explode_column_idx): cpdef Column concatenate_rows(Table input): """Concatenate multiple lists columns into a single lists column row-wise. + For details, see :cpp:func:`concatenate_rows`. + Parameters ---------- input : Table @@ -96,6 +100,8 @@ cpdef Column concatenate_rows(Table input): cpdef Column concatenate_list_elements(Column input, bool dropna): """Concatenate multiple lists on the same row into a single list. + For details, see :cpp:func:`concatenate_list_elements`. + Parameters ---------- input : Column @@ -168,6 +174,8 @@ cpdef Column contains_nulls(Column input): """Create a column of bool values indicating whether each row in the lists column contains a null value. + For details, see :cpp:func:`contains_nulls`. + Parameters ---------- input : Column @@ -290,6 +298,8 @@ cpdef Column segmented_gather(Column input, Column gather_map_list): cpdef Column extract_list_element(Column input, ColumnOrSizeType index): """Create a column of extracted list elements. + For details, see :cpp:func:`extract_list_element`. + Parameters ---------- input : Column diff --git a/python/pylibcudf/pylibcudf/merge.pyx b/python/pylibcudf/pylibcudf/merge.pyx index a7d43c9d158..6d707b67449 100644 --- a/python/pylibcudf/pylibcudf/merge.pyx +++ b/python/pylibcudf/pylibcudf/merge.pyx @@ -19,6 +19,8 @@ cpdef Table merge ( ): """Merge a set of sorted tables. + For details see :cpp:func:`merge`. + Parameters ---------- tables_to_merge : list diff --git a/python/pylibcudf/pylibcudf/partitioning.pxd b/python/pylibcudf/pylibcudf/partitioning.pxd new file mode 100644 index 00000000000..aad60149fc4 --- /dev/null +++ b/python/pylibcudf/pylibcudf/partitioning.pxd @@ -0,0 +1,19 @@ +# Copyright (c) 2024, NVIDIA CORPORATION.
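An illustrative sketch (not part of the diff) of how the lists helpers cross-referenced above are called through pylibcudf's interop layer; the sample list data is arbitrary.

    import pyarrow as pa
    import pylibcudf as plc

    lists_col = plc.interop.from_arrow(pa.array([[1, 2, 3], [4, 5]]))

    # Pull out the first element of every list row.
    firsts = plc.lists.extract_list_element(lists_col, 0)

    # Count the elements in each list row.
    counts = plc.lists.count_elements(lists_col)
    print(plc.interop.to_arrow(counts))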
+ +from .column cimport Column +from .table cimport Table + + +cpdef tuple[Table, list] hash_partition( + Table input, + list columns_to_hash, + int num_partitions +) + +cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partitions) + +cpdef tuple[Table, list] round_robin_partition( + Table input, + int num_partitions, + int start_partition=* +) diff --git a/python/pylibcudf/pylibcudf/partitioning.pyx b/python/pylibcudf/pylibcudf/partitioning.pyx new file mode 100644 index 00000000000..8fa70daab5a --- /dev/null +++ b/python/pylibcudf/pylibcudf/partitioning.pyx @@ -0,0 +1,120 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +cimport pylibcudf.libcudf.types as libcudf_types +from libcpp.memory cimport unique_ptr +from libcpp.pair cimport pair +from libcpp.utility cimport move +from libcpp.vector cimport vector +from pylibcudf.libcudf cimport partitioning as cpp_partitioning +from pylibcudf.libcudf.table.table cimport table + +from .column cimport Column +from .table cimport Table + + +cpdef tuple[Table, list] hash_partition( + Table input, + list columns_to_hash, + int num_partitions +): + """ + Partitions rows from the input table into multiple output tables. + + For details, see :cpp:func:`hash_partition`. + + Parameters + ---------- + input : Table + The table to partition + columns_to_hash : list[int] + Indices of input columns to hash + num_partitions : int + The number of partitions to use + + Returns + ------- + tuple[Table, list[int]] + An output table and a list of row offsets to each partition + """ + cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result + cdef vector[libcudf_types.size_type] c_columns_to_hash = columns_to_hash + cdef int c_num_partitions = num_partitions + + with nogil: + c_result = move( + cpp_partitioning.hash_partition( + input.view(), c_columns_to_hash, c_num_partitions + ) + ) + + return Table.from_libcudf(move(c_result.first)), list(c_result.second) + +cpdef tuple[Table, list] partition(Table t, Column partition_map, int num_partitions): + """ + Partitions rows of `t` according to the mapping specified by `partition_map`. + + For details, see :cpp:func:`partition`. + + Parameters + ---------- + t : Table + The table to partition + partition_map : Column + Non-nullable column of integer values that map each row + in `t` to its partition. + num_partitions : int + The total number of partitions + + Returns + ------- + tuple[Table, list[int]] + An output table and a list of row offsets to each partition + """ + cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result + cdef int c_num_partitions = num_partitions + + with nogil: + c_result = move( + cpp_partitioning.partition(t.view(), partition_map.view(), c_num_partitions) + ) + + return Table.from_libcudf(move(c_result.first)), list(c_result.second) + + +cpdef tuple[Table, list] round_robin_partition( + Table input, + int num_partitions, + int start_partition=0 +): + """ + Partitions rows of the input table in a round-robin fashion. + + For details, see :cpp:func:`round_robin_partition`. + + Parameters + ---------- + input : Table + The input table to be round-robin partitioned + num_partitions : int + Number of partitions for the table + start_partition : int, default 0 + Index of the first partition + + Returns + ------- + tuple[Table, list[int]] + The partitioned table and the partition offsets + for each partition within the table.
+ """ + cdef pair[unique_ptr[table], vector[libcudf_types.size_type]] c_result + cdef int c_num_partitions = num_partitions + cdef int c_start_partition = start_partition + + with nogil: + c_result = move( + cpp_partitioning.round_robin_partition( + input.view(), c_num_partitions, c_start_partition + ) + ) + + return Table.from_libcudf(move(c_result.first)), list(c_result.second) diff --git a/python/pylibcudf/pylibcudf/quantiles.pyx b/python/pylibcudf/pylibcudf/quantiles.pyx index b847ade774d..3a771fbe7ef 100644 --- a/python/pylibcudf/pylibcudf/quantiles.pyx +++ b/python/pylibcudf/pylibcudf/quantiles.pyx @@ -30,6 +30,8 @@ cpdef Column quantile( Computes the specified quantiles by interpolating values between which they lie, using the interpolation strategy specified in interp. + For details see :cpp:func:`quantile`. + Parameters ---------- input: Column @@ -91,6 +93,8 @@ cpdef Table quantiles( specified quantiles. In the event a quantile lies in between rows, the specified interpolation strategy is used to pick between the rows. + For details see :cpp:func:`quantiles`. + Parameters ---------- input: Table diff --git a/python/pylibcudf/pylibcudf/reshape.pyx b/python/pylibcudf/pylibcudf/reshape.pyx index a99145be900..eb1499ebbea 100644 --- a/python/pylibcudf/pylibcudf/reshape.pyx +++ b/python/pylibcudf/pylibcudf/reshape.pyx @@ -23,6 +23,8 @@ cpdef Column interleave_columns(Table source_table): in = [[A1, A2, A3], [B1, B2, B3]] return = [A1, B1, A2, B2, A3, B3] + For details, see :cpp:func:`interleave_columns`. + Parameters ---------- source_table: Table @@ -44,6 +46,8 @@ cpdef Column interleave_columns(Table source_table): cpdef Table tile(Table source_table, size_type count): """Repeats the rows from input table count times to form a new table. + For details, see :cpp:func:`tile`. + Parameters ---------- source_table: Table diff --git a/python/pylibcudf/pylibcudf/search.pyx b/python/pylibcudf/pylibcudf/search.pyx index ff2468f3f9c..814bc6553d8 100644 --- a/python/pylibcudf/pylibcudf/search.pyx +++ b/python/pylibcudf/pylibcudf/search.pyx @@ -19,6 +19,8 @@ cpdef Column lower_bound( ): """Find smallest indices in haystack where needles may be inserted to retain order. + For details, see :cpp:func:`lower_bound`. + Parameters ---------- haystack : Table @@ -58,6 +60,8 @@ cpdef Column upper_bound( ): """Find largest indices in haystack where needles may be inserted to retain order. + For details, see :cpp:func:`upper_bound`. + Parameters ---------- haystack : Table @@ -92,6 +96,8 @@ cpdef Column upper_bound( cpdef Column contains(Column haystack, Column needles): """Check whether needles are present in haystack. + For details, see :cpp:func:`contains`. + Parameters ---------- haystack : Table diff --git a/python/pylibcudf/pylibcudf/sorting.pyx b/python/pylibcudf/pylibcudf/sorting.pyx index bd173eebacb..42289d54bca 100644 --- a/python/pylibcudf/pylibcudf/sorting.pyx +++ b/python/pylibcudf/pylibcudf/sorting.pyx @@ -16,6 +16,8 @@ from .table cimport Table cpdef Column sorted_order(Table source_table, list column_order, list null_precedence): """Computes the row indices required to sort the table. + For details, see :cpp:func:`sorted_order`. + Parameters ---------- source_table : Table @@ -52,6 +54,8 @@ cpdef Column stable_sorted_order( """Computes the row indices required to sort the table, preserving order of equal elements. + For details, see :cpp:func:`stable_sorted_order`. 
+ Parameters ---------- source_table : Table @@ -90,6 +94,8 @@ cpdef Column rank( ): """Computes the rank of each element in the column. + For details, see :cpp:func:`rank`. + Parameters ---------- input_view : Column @@ -128,6 +134,8 @@ cpdef Column rank( cpdef bool is_sorted(Table tbl, list column_order, list null_precedence): """Checks if the table is sorted. + For details, see :cpp:func:`is_sorted`. + Parameters ---------- tbl : Table @@ -165,6 +173,8 @@ cpdef Table segmented_sort_by_key( ): """Sorts the table by key, within segments. + For details, see :cpp:func:`segmented_sort_by_key`. + Parameters ---------- values : Table @@ -209,6 +219,8 @@ cpdef Table stable_segmented_sort_by_key( """Sorts the table by key preserving order of equal elements, within segments. + For details, see :cpp:func:`stable_segmented_sort_by_key`. + Parameters ---------- values : Table @@ -251,6 +263,8 @@ cpdef Table sort_by_key( ): """Sorts the table by key. + For details, see :cpp:func:`sort_by_key`. + Parameters ---------- values : Table @@ -290,6 +304,8 @@ cpdef Table stable_sort_by_key( ): """Sorts the table by key preserving order of equal elements. + For details, see :cpp:func:`stable_sort_by_key`. + Parameters ---------- values : Table @@ -324,6 +340,8 @@ cpdef Table stable_sort_by_key( cpdef Table sort(Table source_table, list column_order, list null_precedence): """Sorts the table. + For details, see :cpp:func:`sort`. + Parameters ---------- source_table : Table @@ -355,6 +373,8 @@ cpdef Table sort(Table source_table, list column_order, list null_precedence): cpdef Table stable_sort(Table source_table, list column_order, list null_precedence): """Sorts the table preserving order of equal elements. + For details, see :cpp:func:`stable_sort`. + Parameters ---------- source_table : Table diff --git a/python/pylibcudf/pylibcudf/stream_compaction.pyx b/python/pylibcudf/pylibcudf/stream_compaction.pyx index b574bfa9fa2..d5475ea79d5 100644 --- a/python/pylibcudf/pylibcudf/stream_compaction.pyx +++ b/python/pylibcudf/pylibcudf/stream_compaction.pyx @@ -25,6 +25,8 @@ from .table cimport Table cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): """Filters out rows from the input table based on the presence of nulls. + For details, see :cpp:func:`drop_nulls`. + Parameters ---------- source_table : Table @@ -53,6 +55,8 @@ cpdef Table drop_nulls(Table source_table, list keys, size_type keep_threshold): cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): """Filters out rows from the input table based on the presence of NaNs. + For details, see :cpp:func:`drop_nans`. + Parameters ---------- source_table : Table @@ -81,6 +85,8 @@ cpdef Table drop_nans(Table source_table, list keys, size_type keep_threshold): cpdef Table apply_boolean_mask(Table source_table, Column boolean_mask): """Filters out rows from the input table based on a boolean mask. + For details, see :cpp:func:`apply_boolean_mask`. + Parameters ---------- source_table : Table @@ -111,6 +117,8 @@ cpdef Table unique( ): """Filter duplicate consecutive rows from the input table. + For details, see :cpp:func:`unique`. + Parameters ---------- input : Table @@ -153,6 +161,8 @@ cpdef Table distinct( ): """Get the distinct rows from the input table. + For details, see :cpp:func:`distinct`. + Parameters ---------- input : Table @@ -191,6 +201,8 @@ cpdef Column distinct_indices( ): """Get the indices of the distinct rows from the input table. + For details, see :cpp:func:`distinct_indices`. 
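A sketch (illustrative only, not part of the diff) of the sorting entry points whose docstrings are cross-referenced above; the enum locations plc.types.Order and plc.types.NullOrder are assumptions based on pylibcudf's types module.

    import pyarrow as pa
    import pylibcudf as plc

    tbl = plc.interop.from_arrow(pa.table({"a": [3, None, 1, 2]}))

    # Sort ascending on the single key column, placing nulls after valid rows.
    out = plc.sorting.sort(
        tbl,
        [plc.types.Order.ASCENDING],
        [plc.types.NullOrder.AFTER],
    )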
+ Parameters ---------- input : Table @@ -226,6 +238,8 @@ cpdef Table stable_distinct( ): """Get the distinct rows from the input table, preserving input order. + For details, see :cpp:func:`stable_distinct`. + Parameters ---------- input : Table @@ -263,6 +277,8 @@ cpdef size_type unique_count( ): """Returns the number of unique consecutive elements in the input column. + For details, see :cpp:func:`unique_count`. + Parameters ---------- source : Column @@ -294,6 +310,8 @@ cpdef size_type distinct_count( ): """Returns the number of distinct elements in the input column. + For details, see :cpp:func:`distinct_count`. + Parameters ---------- source : Column diff --git a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt index 77f20b0b917..142bc124ca2 100644 --- a/python/pylibcudf/pylibcudf/strings/CMakeLists.txt +++ b/python/pylibcudf/pylibcudf/strings/CMakeLists.txt @@ -13,8 +13,21 @@ # ============================================================================= set(cython_sources - capitalize.pyx case.pyx char_types.pyx contains.pyx extract.pyx find.pyx findall.pyx - regex_flags.pyx regex_program.pyx repeat.pyx replace.pyx side_type.pyx slice.pyx strip.pyx + attributes.pyx + capitalize.pyx + case.pyx + char_types.pyx + contains.pyx + extract.pyx + find.pyx + findall.pyx + regex_flags.pyx + regex_program.pyx + repeat.pyx + replace.pyx + side_type.pyx + slice.pyx + strip.pyx ) set(linked_libraries cudf::cudf) diff --git a/python/pylibcudf/pylibcudf/strings/__init__.pxd b/python/pylibcudf/pylibcudf/strings/__init__.pxd index 91d884b294b..d8afccc7336 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.pxd +++ b/python/pylibcudf/pylibcudf/strings/__init__.pxd @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from . cimport ( + attributes, capitalize, case, char_types, @@ -16,3 +17,21 @@ from . cimport ( strip, ) from .side_type cimport side_type + +__all__ = [ + "attributes", + "capitalize", + "case", + "char_types", + "contains", + "convert", + "extract", + "find", + "findall", + "regex_flags", + "regex_program", + "replace", + "slice", + "strip", + "side_type", +] diff --git a/python/pylibcudf/pylibcudf/strings/__init__.py b/python/pylibcudf/pylibcudf/strings/__init__.py index b4856784390..22452812e42 100644 --- a/python/pylibcudf/pylibcudf/strings/__init__.py +++ b/python/pylibcudf/pylibcudf/strings/__init__.py @@ -1,6 +1,7 @@ # Copyright (c) 2024, NVIDIA CORPORATION. from . import ( + attributes, capitalize, case, char_types, @@ -17,3 +18,21 @@ strip, ) from .side_type import SideType + +__all__ = [ + "attributes", + "capitalize", + "case", + "char_types", + "contains", + "convert", + "extract", + "find", + "findall", + "regex_flags", + "regex_program", + "replace", + "slice", + "strip", + "SideType", +] diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pxd b/python/pylibcudf/pylibcudf/strings/attributes.pxd new file mode 100644 index 00000000000..27398766924 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/attributes.pxd @@ -0,0 +1,10 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
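A hedged sketch (not part of the diff) of the drop_nulls wrapper documented above: keep rows with non-null values in at least keep_threshold of the key columns. Treating keys as column indices is an assumption based on the libcudf API.

    import pyarrow as pa
    import pylibcudf as plc

    tbl = plc.interop.from_arrow(
        pa.table({"a": [1, None, 3], "b": [None, None, 6]})
    )

    # Keep rows where both key columns (indices 0 and 1) are non-null.
    filtered = plc.stream_compaction.drop_nulls(tbl, [0, 1], 2)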
+ +from pylibcudf.column cimport Column + + +cpdef Column count_characters(Column source_strings) + +cpdef Column count_bytes(Column source_strings) + +cpdef Column code_points(Column source_strings) diff --git a/python/pylibcudf/pylibcudf/strings/attributes.pyx b/python/pylibcudf/pylibcudf/strings/attributes.pyx new file mode 100644 index 00000000000..36bee7bd1d9 --- /dev/null +++ b/python/pylibcudf/pylibcudf/strings/attributes.pyx @@ -0,0 +1,76 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move +from pylibcudf.column cimport Column +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.strings cimport attributes as cpp_attributes + + +cpdef Column count_characters(Column source_strings): + """ + Returns a column containing character lengths of each string + in the given column. + + Parameters + ---------- + source_strings : Column + Column of strings. + + Returns + ------- + Column + New column with lengths for each string + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_attributes.count_characters(source_strings.view())) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column count_bytes(Column source_strings): + """ + Returns a column containing byte lengths of each string + in the given column. + + Parameters + ---------- + source_strings : Column + Column of strings. + + Returns + ------- + Column + New column with the number of bytes for each string + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_attributes.count_bytes(source_strings.view())) + + return Column.from_libcudf(move(c_result)) + + +cpdef Column code_points(Column source_strings): + """ + Creates a numeric column with code point values (integers) + for each character of each string. + + Parameters + ---------- + source_strings : Column + Column of strings. + + Returns + ------- + Column + New column with code point integer values for each character + """ + cdef unique_ptr[column] c_result + + with nogil: + c_result = move(cpp_attributes.code_points(source_strings.view())) + + return Column.from_libcudf(move(c_result)) diff --git a/python/pylibcudf/pylibcudf/strings/findall.pyx b/python/pylibcudf/pylibcudf/strings/findall.pyx index 03ecb13a50e..3a6b87504b3 100644 --- a/python/pylibcudf/pylibcudf/strings/findall.pyx +++ b/python/pylibcudf/pylibcudf/strings/findall.pyx @@ -13,7 +13,7 @@ cpdef Column findall(Column input, RegexProgram pattern): Returns a lists column of strings for each matching occurrence using the regex_program pattern within each string. - For details, see For details, see :cpp:func:`cudf::strings::findall`. + For details, see :cpp:func:`cudf::strings::findall`. Parameters ---------- diff --git a/python/pylibcudf/pylibcudf/tests/common/utils.py b/python/pylibcudf/pylibcudf/tests/common/utils.py index babe6634318..9f389fa42c4 100644 --- a/python/pylibcudf/pylibcudf/tests/common/utils.py +++ b/python/pylibcudf/pylibcudf/tests/common/utils.py @@ -9,6 +9,7 @@ import pyarrow.compute as pc import pylibcudf as plc import pytest +from pyarrow.orc import write_table as orc_write_table from pyarrow.parquet import write_table as pq_write_table from pylibcudf.io.types import CompressionType @@ -242,13 +243,21 @@ def is_nested_list(typ): return nesting_level(typ)[0] > 1 -def _convert_numeric_types_to_floating(pa_table): +def _convert_types(pa_table, input_pred, result_type): """ - Useful little helper for testing the - dtypes option in I/O readers. 
+ Useful little helper for testing the dtypes option in I/O readers. - Returns a tuple containing the pylibcudf dtypes - and the new pyarrow schema + Returns a tuple containing the pylibcudf dtypes and the new pyarrow schema based on + the data in the table. + + Parameters + ---------- + pa_table : pyarrow.Table + The table from which to extract the dtypes + input_pred : function + Predicate that evaluates to true for types to replace + result_type : pa.DataType + The type to cast to """ dtypes = [] new_fields = [] @@ -257,11 +266,9 @@ def _convert_numeric_types_to_floating(pa_table): child_types = [] plc_type = plc.interop.from_arrow(field.type) - if pa.types.is_integer(field.type) or pa.types.is_unsigned_integer( - field.type - ): - plc_type = plc.interop.from_arrow(pa.float64()) - field = field.with_type(pa.float64()) + if input_pred(field.type): + plc_type = plc.interop.from_arrow(result_type) + field = field.with_type(result_type) dtypes.append((field.name, plc_type, child_types)) @@ -332,6 +339,16 @@ def make_source(path_or_buf, pa_table, format, **kwargs): if isinstance(path_or_buf, io.IOBase) else path_or_buf, ) + elif format == "orc": + # The conversion to pandas is lossy (doesn't preserve + # nested types) so we + # will just use pyarrow directly to write this + orc_write_table( + pa_table, + pa.PythonFile(path_or_buf) + if isinstance(path_or_buf, io.IOBase) + else path_or_buf, + ) if isinstance(path_or_buf, io.IOBase): path_or_buf.seek(0) return path_or_buf diff --git a/python/pylibcudf/pylibcudf/tests/io/test_csv.py b/python/pylibcudf/pylibcudf/tests/io/test_csv.py index ccd7eef54f3..ab26f23418d 100644 --- a/python/pylibcudf/pylibcudf/tests/io/test_csv.py +++ b/python/pylibcudf/pylibcudf/tests/io/test_csv.py @@ -9,7 +9,7 @@ import pytest from pylibcudf.io.types import CompressionType from utils import ( - _convert_numeric_types_to_floating, + _convert_types, assert_table_and_meta_eq, make_source, write_source_str, @@ -148,7 +148,11 @@ def test_read_csv_dtypes(csv_table_data, source_or_sink, usecols): if usecols is not None: pa_table = pa_table.select(usecols) - dtypes, new_fields = _convert_numeric_types_to_floating(pa_table) + dtypes, new_fields = _convert_types( + pa_table, + lambda t: (pa.types.is_unsigned_integer(t) or pa.types.is_integer(t)), + pa.float64(), + ) # Extract the dtype out of the (name, type, child_types) tuple # (read_csv doesn't support this format since it doesn't support nested columns) dtypes = {name: dtype for name, dtype, _ in dtypes} diff --git a/python/pylibcudf/pylibcudf/tests/io/test_orc.py b/python/pylibcudf/pylibcudf/tests/io/test_orc.py new file mode 100644 index 00000000000..42b14b1feff --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/io/test_orc.py @@ -0,0 +1,53 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
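A short usage sketch (not part of the diff) for the strings.attributes functions added earlier in this changeset; the sample strings are arbitrary.

    import pyarrow as pa
    import pylibcudf as plc

    col = plc.interop.from_arrow(pa.array(["héllo", "cudf", None]))

    chars = plc.strings.attributes.count_characters(col)  # characters per row
    nbytes = plc.strings.attributes.count_bytes(col)      # UTF-8 bytes per row
    points = plc.strings.attributes.code_points(col)      # one row per character
    print(plc.interop.to_arrow(chars))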
+import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import _convert_types, assert_table_and_meta_eq, make_source + +# Shared kwargs to pass to make_source +_COMMON_ORC_SOURCE_KWARGS = {"format": "orc"} + + +@pytest.mark.parametrize("columns", [None, ["col_int64", "col_bool"]]) +def test_read_orc_basic( + table_data, binary_source_or_sink, nrows_skiprows, columns +): + _, pa_table = table_data + nrows, skiprows = nrows_skiprows + + # ORC reader doesn't support skip_rows for nested columns + if skiprows > 0: + colnames_to_drop = [] + for i in range(len(pa_table.schema)): + field = pa_table.schema.field(i) + + if pa.types.is_nested(field.type): + colnames_to_drop.append(field.name) + pa_table = pa_table.drop(colnames_to_drop) + # ORC doesn't support unsigned ints + # let's cast to int64 + _, new_fields = _convert_types( + pa_table, pa.types.is_unsigned_integer, pa.int64() + ) + pa_table = pa_table.cast(pa.schema(new_fields)) + + source = make_source( + binary_source_or_sink, pa_table, **_COMMON_ORC_SOURCE_KWARGS + ) + + res = plc.io.orc.read_orc( + plc.io.SourceInfo([source]), + nrows=nrows, + skip_rows=skiprows, + columns=columns, + ) + + if columns is not None: + pa_table = pa_table.select(columns) + + # Adapt to nrows/skiprows + pa_table = pa_table.slice( + offset=skiprows, length=nrows if nrows != -1 else None + ) + + assert_table_and_meta_eq(pa_table, res, check_field_nullability=False) diff --git a/python/pylibcudf/pylibcudf/tests/io/test_timezone.py b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py new file mode 100644 index 00000000000..76b0424b2af --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/io/test_timezone.py @@ -0,0 +1,16 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +import zoneinfo + +import pylibcudf as plc +import pytest + + +def test_make_timezone_transition_table(): + if len(zoneinfo.TZPATH) == 0: + pytest.skip("No TZPATH available.") + tz_path = zoneinfo.TZPATH[0] + result = plc.io.timezone.make_timezone_transition_table( + tz_path, "America/Los_Angeles" + ) + assert isinstance(result, plc.Table) + assert result.num_rows() > 0 diff --git a/python/pylibcudf/pylibcudf/tests/test_partitioning.py b/python/pylibcudf/pylibcudf/tests/test_partitioning.py new file mode 100644 index 00000000000..444d0089d2c --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_partitioning.py @@ -0,0 +1,55 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. 
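A hedged sketch mirroring the timezone test above; the first zoneinfo.TZPATH entry is used as the TZif directory, and "America/New_York" is a placeholder zone name.

    import zoneinfo
    import pylibcudf as plc

    tz_table = plc.io.timezone.make_timezone_transition_table(
        zoneinfo.TZPATH[0], "America/New_York"
    )
    print(tz_table.num_rows())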
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from utils import assert_table_eq + + +@pytest.fixture(scope="module") +def partitioning_data(): + data = {"a": [1, 2, 3], "b": [1, 2, 5], "c": [1, 2, 10]} + pa_table = pa.table(data) + plc_table = plc.interop.from_arrow(pa_table) + return data, plc_table, pa_table + + +def test_partition(partitioning_data): + raw_data, plc_table, pa_table = partitioning_data + result, result_offsets = plc.partitioning.partition( + plc_table, + plc.interop.from_arrow(pa.array([0, 0, 0])), + 1, + ) + expected = pa.table( + list(raw_data.values()), + schema=pa.schema([pa.field("", pa.int64(), nullable=False)] * 3), + ) + assert_table_eq(expected, result) + assert result_offsets == [0, 3] + + +def test_hash_partition(partitioning_data): + raw_data, plc_table, pa_table = partitioning_data + result, result_offsets = plc.partitioning.hash_partition( + plc_table, [0, 1], 1 + ) + expected = pa.table( + list(raw_data.values()), + schema=pa.schema([pa.field("", pa.int64(), nullable=False)] * 3), + ) + assert_table_eq(expected, result) + assert result_offsets == [0] + + +def test_round_robin_partition(partitioning_data): + raw_data, plc_table, pa_table = partitioning_data + result, result_offsets = plc.partitioning.round_robin_partition( + plc_table, 1, 0 + ) + expected = pa.table( + list(raw_data.values()), + schema=pa.schema([pa.field("", pa.int64(), nullable=False)] * 3), + ) + assert_table_eq(expected, result) + assert result_offsets == [0] diff --git a/python/pylibcudf/pylibcudf/tests/test_string_attributes.py b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py new file mode 100644 index 00000000000..a1820def0b1 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_string_attributes.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. + +import pyarrow as pa +import pyarrow.compute as pc +import pylibcudf as plc +import pytest +from utils import assert_column_eq + + +@pytest.fixture() +def str_data(): + pa_data = pa.array(["A", None]) + return pa_data, plc.interop.from_arrow(pa_data) + + +def test_count_characters(str_data): + result = plc.strings.attributes.count_characters(str_data[1]) + expected = pc.utf8_length(str_data[0]) + assert_column_eq(expected, result) + + +def test_count_bytes(str_data): + result = plc.strings.attributes.count_bytes(str_data[1]) + expected = pc.binary_length(str_data[0]) + assert_column_eq(expected, result) + + +def test_code_points(str_data): + result = plc.strings.attributes.code_points(str_data[1]) + exp_data = [ord(str_data[0].to_pylist()[0])] + expected = pa.chunked_array([exp_data], type=pa.int32()) + assert_column_eq(expected, result) diff --git a/python/pylibcudf/pylibcudf/tests/test_transpose.py b/python/pylibcudf/pylibcudf/tests/test_transpose.py new file mode 100644 index 00000000000..ac11123f680 --- /dev/null +++ b/python/pylibcudf/pylibcudf/tests/test_transpose.py @@ -0,0 +1,32 @@ +# Copyright (c) 2024, NVIDIA CORPORATION.
+ +import pyarrow as pa +import pylibcudf as plc +import pytest +from packaging.version import parse + + +@pytest.mark.skipif( + parse(pa.__version__) < parse("16.0.0"), + reason="https://github.com/apache/arrow/pull/40070", +) +@pytest.mark.parametrize( + "arr", + [ + [], + [1, 2, 3], + [1, 2], + [1], + ], +) +def test_transpose(arr): + data = {"a": arr, "b": arr} + arrow_tbl = pa.table(data) + plc_tbl = plc.interop.from_arrow(arrow_tbl) + plc_result = plc.transpose.transpose(plc_tbl) + result = plc.interop.to_arrow(plc_result) + expected = pa.Table.from_pandas( + arrow_tbl.to_pandas().T, preserve_index=False + ).rename_columns([""] * len(arr)) + expected = pa.table(expected, schema=result.schema) + assert result.equals(expected) diff --git a/python/pylibcudf/pylibcudf/transform.pyx b/python/pylibcudf/pylibcudf/transform.pyx index bcd6185521a..de425a27c15 100644 --- a/python/pylibcudf/pylibcudf/transform.pyx +++ b/python/pylibcudf/pylibcudf/transform.pyx @@ -20,6 +20,8 @@ from .utils cimport int_to_bitmask_ptr cpdef tuple[gpumemoryview, int] nans_to_nulls(Column input): """Create a null mask preserving existing nulls and converting nans to null. + For details, see :cpp:func:`nans_to_nulls`. + Parameters ---------- input : Column diff --git a/python/pylibcudf/pylibcudf/transpose.pxd b/python/pylibcudf/pylibcudf/transpose.pxd new file mode 100644 index 00000000000..7b5a7676b49 --- /dev/null +++ b/python/pylibcudf/pylibcudf/transpose.pxd @@ -0,0 +1,5 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from .table cimport Table + + +cpdef Table transpose(Table input_table) diff --git a/python/pylibcudf/pylibcudf/transpose.pyx b/python/pylibcudf/pylibcudf/transpose.pyx new file mode 100644 index 00000000000..a708f6cc37f --- /dev/null +++ b/python/pylibcudf/pylibcudf/transpose.pyx @@ -0,0 +1,38 @@ +# Copyright (c) 2024, NVIDIA CORPORATION. +from libcpp.memory cimport unique_ptr +from libcpp.pair cimport pair +from libcpp.utility cimport move +from pylibcudf.libcudf cimport transpose as cpp_transpose +from pylibcudf.libcudf.column.column cimport column +from pylibcudf.libcudf.table.table_view cimport table_view + +from .column cimport Column +from .table cimport Table + + +cpdef Table transpose(Table input_table): + """Transpose a Table. + + For details, see :cpp:func:`transpose`. + + Parameters + ---------- + input_table : Table + Table to transpose + + Returns + ------- + Table + Transposed table. + """ + cdef pair[unique_ptr[column], table_view] c_result + cdef Table owner_table + + with nogil: + c_result = move(cpp_transpose.transpose(input_table.view())) + + owner_table = Table( + [Column.from_libcudf(move(c_result.first))] * c_result.second.num_columns() + ) + + return Table.from_table_view(c_result.second, owner_table)
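A brief usage sketch (not part of the diff) for the transpose binding added above; transposition requires every input column to share a single type, and the sample table is arbitrary.

    import pyarrow as pa
    import pylibcudf as plc

    tbl = plc.interop.from_arrow(pa.table({"a": [1, 2, 3], "b": [4, 5, 6]}))

    # A 3-row x 2-column table becomes a 2-row x 3-column table.
    flipped = plc.transpose.transpose(tbl)
    print(plc.interop.to_arrow(flipped))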