Skip to content

Commit

Permalink
Merge branch 'branch-24.08' of github.com:rapidsai/cudf into pylibcud…
Browse files Browse the repository at this point in the history
…f-io-writers
  • Loading branch information
lithomas1 committed Jun 12, 2024
2 parents c54316e + 0891c5d commit dc93356
Show file tree
Hide file tree
Showing 319 changed files with 3,837 additions and 2,931 deletions.
25 changes: 15 additions & 10 deletions .github/workflows/external_issue_labeler.yml
Original file line number Diff line number Diff line change
Expand Up @@ -20,36 +20,41 @@ on:
types:
- opened

pull_request:
pull_request_target:
types:
- opened

env:
GITHUB_TOKEN: ${{ github.token }}

permissions:
issues: write
pull-requests: write

jobs:
Label-Issue:
runs-on: ubuntu-latest
# Only run if the issue author is not part of RAPIDS
if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.issue.author_association)}}
permissions:
issues: write
if: github.event_name == 'issues'
steps:
- name: add-external-labels
# Only run if the issue author is not part of RAPIDS
if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.issue.author_association)}}
run: |
echo ${{ github.event.issue.author_association }}
issue_url=${{ github.event.issue.html_url }}
gh issue edit ${issue_url} --add-label "External"
continue-on-error: true

Label-PR:
runs-on: ubuntu-latest
# Only run if the pull request author is not part of RAPIDS
if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association)}}
permissions:
pull-requests: write
issues: write
if: github.event_name == 'pull_request_target'
steps:
- name: add-external-labels
# Only run if the pull request author is not part of RAPIDS
if: ${{ ! contains(fromJSON('["OWNER", "MEMBER", "COLLABORATOR"]'), github.event.pull_request.author_association)}}
run: |
echo ${{ github.event.pull_request.author_association }}
pr_url=${{ github.event.pull_request.html_url }}
gh issue edit ${pr_url} --add-label "External"
continue-on-error: true
continue-on-error: true
2 changes: 1 addition & 1 deletion .github/workflows/pr_issue_status_automation.yml
Original file line number Diff line number Diff line change
Expand Up @@ -50,7 +50,7 @@ jobs:

update-sprint:
# This job sets the PR and its linked issues to the current "Weekly Sprint"
uses: jarmak-nv/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08
uses: rapidsai/shared-workflows/.github/workflows/project-get-set-iteration-field.yaml@branch-24.08
if: github.event.pull_request.state == 'open'
needs: get-project-id
with:
Expand Down
8 changes: 8 additions & 0 deletions .pre-commit-config.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -56,12 +56,20 @@ repos:
- id: clang-format
types_or: [c, c++, cuda]
args: ["-fallback-style=none", "-style=file", "-i"]
exclude: |
(?x)^(
^cpp/src/io/parquet/ipc/Schema_generated.h|
^cpp/src/io/parquet/ipc/Message_generated.h|
^cpp/include/cudf_test/cxxopts.hpp|
)
- repo: https://github.com/sirosen/texthooks
rev: 0.6.6
hooks:
- id: fix-smartquotes
exclude: |
(?x)^(
^cpp/src/io/parquet/ipc/Schema_generated.h|
^cpp/src/io/parquet/ipc/Message_generated.h|
^cpp/include/cudf_test/cxxopts.hpp|
^python/cudf/cudf/tests/data/subword_tokenizer_data/.*|
^python/cudf/cudf/tests/text/test_text_methods.py
Expand Down
4 changes: 1 addition & 3 deletions ci/build_cpp.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,10 @@ export CMAKE_GENERATOR=Ninja

rapids-print-env

version=$(rapids-generate-version)

rapids-logger "Begin cpp build"

# With boa installed conda build forward to boa
RAPIDS_PACKAGE_VERSION=${version} rapids-conda-retry mambabuild \
RAPIDS_PACKAGE_VERSION=$(rapids-generate-version) rapids-conda-retry mambabuild \
conda/recipes/libcudf

rapids-upload-conda-to-s3 cpp
2 changes: 1 addition & 1 deletion ci/build_docs.sh
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,7 @@ ENV_YAML_DIR="$(mktemp -d)"

rapids-dependency-file-generator \
--output conda \
--file_key docs \
--file-key docs \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"

rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n docs
Expand Down
2 changes: 1 addition & 1 deletion ci/check_style.sh
Original file line number Diff line number Diff line change
Expand Up @@ -10,7 +10,7 @@ ENV_YAML_DIR="$(mktemp -d)"

rapids-dependency-file-generator \
--output conda \
--file_key checks \
--file-key checks \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"

rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n checks
Expand Down
2 changes: 1 addition & 1 deletion ci/configure_cpp_static.sh
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@ REQUIREMENTS_FILE="${ENV_YAML_DIR}/requirements.txt"

rapids-dependency-file-generator \
--output requirements \
--file_key test_static_build \
--file-key test_static_build \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${REQUIREMENTS_FILE}"

python -m pip install -r "${REQUIREMENTS_FILE}"
Expand Down
2 changes: 1 addition & 1 deletion ci/test_cpp_common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ ENV_YAML_DIR="$(mktemp -d)"

rapids-dependency-file-generator \
--output conda \
--file_key test_cpp \
--file-key test_cpp \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml"

rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
Expand Down
2 changes: 1 addition & 1 deletion ci/test_java.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ ENV_YAML_DIR="$(mktemp -d)"

rapids-dependency-file-generator \
--output conda \
--file_key test_java \
--file-key test_java \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch)" | tee "${ENV_YAML_DIR}/env.yaml"

rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
Expand Down
2 changes: 1 addition & 1 deletion ci/test_notebooks.sh
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@ ENV_YAML_DIR="$(mktemp -d)"

rapids-dependency-file-generator \
--output conda \
--file_key test_notebooks \
--file-key test_notebooks \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"

rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
Expand Down
2 changes: 1 addition & 1 deletion ci/test_python_common.sh
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@ ENV_YAML_DIR="$(mktemp -d)"

rapids-dependency-file-generator \
--output conda \
--file_key test_python \
--file-key test_python \
--matrix "cuda=${RAPIDS_CUDA_VERSION%.*};arch=$(arch);py=${RAPIDS_PY_VERSION}" | tee "${ENV_YAML_DIR}/env.yaml"

rapids-mamba-retry env create --yes -f "${ENV_YAML_DIR}/env.yaml" -n test
Expand Down
1 change: 1 addition & 0 deletions cpp/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -664,6 +664,7 @@ add_library(
src/utilities/default_stream.cpp
src/utilities/linked_column.cpp
src/utilities/logger.cpp
src/utilities/pinned_memory.cpp
src/utilities/stacktrace.cpp
src/utilities/stream_pool.cpp
src/utilities/traits.cpp
Expand Down
13 changes: 7 additions & 6 deletions cpp/benchmarks/fixture/nvbench_fixture.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -15,8 +15,8 @@
*/
#pragma once

#include <cudf/io/memory_resource.hpp>
#include <cudf/utilities/error.hpp>
#include <cudf/utilities/pinned_memory.hpp>

#include <rmm/cuda_device.hpp>
#include <rmm/mr/device/arena_memory_resource.hpp>
Expand Down Expand Up @@ -81,17 +81,18 @@ struct nvbench_base_fixture {
"\nExpecting: cuda, pool, async, arena, managed, or managed_pool");
}

inline rmm::host_async_resource_ref make_cuio_host_pinned()
inline rmm::host_device_async_resource_ref make_cuio_host_pinned()
{
static std::shared_ptr<rmm::mr::pinned_host_memory_resource> mr =
std::make_shared<rmm::mr::pinned_host_memory_resource>();
return *mr;
}

inline rmm::host_async_resource_ref create_cuio_host_memory_resource(std::string const& mode)
inline rmm::host_device_async_resource_ref create_cuio_host_memory_resource(
std::string const& mode)
{
if (mode == "pinned") return make_cuio_host_pinned();
if (mode == "pinned_pool") return cudf::io::get_host_memory_resource();
if (mode == "pinned_pool") return cudf::get_pinned_memory_resource();
CUDF_FAIL("Unknown cuio_host_mem parameter: " + mode + "\nExpecting: pinned or pinned_pool");
}

Expand All @@ -112,14 +113,14 @@ struct nvbench_base_fixture {
rmm::mr::set_current_device_resource(mr.get());
std::cout << "RMM memory resource = " << rmm_mode << "\n";

cudf::io::set_host_memory_resource(create_cuio_host_memory_resource(cuio_host_mode));
cudf::set_pinned_memory_resource(create_cuio_host_memory_resource(cuio_host_mode));
std::cout << "CUIO host memory resource = " << cuio_host_mode << "\n";
}

~nvbench_base_fixture()
{
// Ensure the pool is freed before the CUDA context is destroyed:
cudf::io::set_host_memory_resource(this->make_cuio_host_pinned());
cudf::set_pinned_memory_resource(this->make_cuio_host_pinned());
}

std::shared_ptr<rmm::mr::device_memory_resource> mr;
Expand Down
12 changes: 12 additions & 0 deletions cpp/benchmarks/io/cuio_common.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,9 @@
#include <cudf/detail/utilities/integer_utils.hpp>
#include <cudf/detail/utilities/logger.hpp>

#include <rmm/mr/pinned_host_memory_resource.hpp>
#include <rmm/resource_ref.hpp>

#include <unistd.h>

#include <cstdio>
Expand All @@ -28,6 +31,14 @@

temp_directory const cuio_source_sink_pair::tmpdir{"cudf_gbench"};

// Don't use cudf's pinned pool for the source data
rmm::host_async_resource_ref pinned_memory_resource()
{
static rmm::mr::pinned_host_memory_resource mr = rmm::mr::pinned_host_memory_resource{};

return mr;
}

std::string random_file_in_dir(std::string const& dir_path)
{
// `mkstemp` modifies the template in place
Expand All @@ -41,6 +52,7 @@ std::string random_file_in_dir(std::string const& dir_path)

cuio_source_sink_pair::cuio_source_sink_pair(io_type type)
: type{type},
pinned_buffer({pinned_memory_resource(), cudf::get_default_stream()}),
d_buffer{0, cudf::get_default_stream()},
file_name{random_file_in_dir(tmpdir.path())},
void_sink{cudf::io::data_sink::create()}
Expand Down
4 changes: 2 additions & 2 deletions cpp/benchmarks/io/cuio_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@

#include <cudf_test/file_utilities.hpp>

#include <cudf/detail/utilities/pinned_host_vector.hpp>
#include <cudf/detail/utilities/host_vector.hpp>
#include <cudf/io/data_sink.hpp>
#include <cudf/io/datasource.hpp>

Expand Down Expand Up @@ -79,7 +79,7 @@ class cuio_source_sink_pair {

io_type const type;
std::vector<char> h_buffer;
cudf::detail::pinned_host_vector<char> pinned_buffer;
cudf::detail::host_vector<char> pinned_buffer;
rmm::device_uvector<std::byte> d_buffer;
std::string const file_name;
std::unique_ptr<cudf::io::data_sink> void_sink;
Expand Down
2 changes: 1 addition & 1 deletion cpp/benchmarks/io/parquet/parquet_reader_multithread.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,9 @@
#include <benchmarks/io/nvbench_helpers.hpp>

#include <cudf/detail/utilities/stream_pool.hpp>
#include <cudf/io/memory_resource.hpp>
#include <cudf/io/parquet.hpp>
#include <cudf/utilities/default_stream.hpp>
#include <cudf/utilities/pinned_memory.hpp>
#include <cudf/utilities/thread_pool.hpp>

#include <nvtx3/nvtx3.hpp>
Expand Down
10 changes: 5 additions & 5 deletions cpp/benchmarks/io/text/multibyte_split.cpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2021-2023, NVIDIA CORPORATION.
* Copyright (c) 2021-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand All @@ -22,7 +22,6 @@
#include <cudf_test/file_utilities.hpp>

#include <cudf/column/column_factories.hpp>
#include <cudf/detail/utilities/pinned_host_vector.hpp>
#include <cudf/detail/utilities/vector_factories.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/detail/bgzip_utils.hpp>
Expand Down Expand Up @@ -132,9 +131,10 @@ static void bench_multibyte_split(nvbench::state& state,

auto const delim_factor = static_cast<double>(delim_percent) / 100;
std::unique_ptr<cudf::io::datasource> datasource;
auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim);
auto host_input = std::vector<char>{};
auto host_pinned_input = cudf::detail::pinned_host_vector<char>{};
auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim);
auto host_input = std::vector<char>{};
auto host_pinned_input =
cudf::detail::make_pinned_vector_async<char>(0, cudf::get_default_stream());

if (source_type != data_chunk_source_type::device &&
source_type != data_chunk_source_type::host_pinned) {
Expand Down
7 changes: 5 additions & 2 deletions cpp/include/cudf/ast/expressions.hpp
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
Expand Down Expand Up @@ -478,7 +478,10 @@ class operation : public expression {
*
* @return Vector of operands
*/
std::vector<std::reference_wrapper<expression const>> get_operands() const { return operands; }
[[nodiscard]] std::vector<std::reference_wrapper<expression const>> get_operands() const
{
return operands;
}

/**
* @copydoc expression::accept
Expand Down
10 changes: 5 additions & 5 deletions cpp/include/cudf/column/column_device_view.cuh
Original file line number Diff line number Diff line change
Expand Up @@ -442,7 +442,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
* @return string_view instance representing this element at this index
*/
template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, string_view>)>
__device__ T element(size_type element_index) const noexcept
__device__ [[nodiscard]] T element(size_type element_index) const noexcept
{
size_type index = element_index + offset(); // account for this view's _offset
char const* d_strings = static_cast<char const*>(_data);
Expand Down Expand Up @@ -501,7 +501,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
* @return dictionary32 instance representing this element at this index
*/
template <typename T, CUDF_ENABLE_IF(std::is_same_v<T, dictionary32>)>
__device__ T element(size_type element_index) const noexcept
__device__ [[nodiscard]] T element(size_type element_index) const noexcept
{
size_type index = element_index + offset(); // account for this view's _offset
auto const indices = d_children[0];
Expand All @@ -519,7 +519,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
* @return numeric::fixed_point representing the element at this index
*/
template <typename T, CUDF_ENABLE_IF(cudf::is_fixed_point<T>())>
__device__ T element(size_type element_index) const noexcept
__device__ [[nodiscard]] T element(size_type element_index) const noexcept
{
using namespace numeric;
using rep = typename T::rep;
Expand Down Expand Up @@ -858,7 +858,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
*/
[[nodiscard]] __device__ device_span<column_device_view const> children() const noexcept
{
return device_span<column_device_view const>(d_children, _num_children);
return {d_children, static_cast<std::size_t>(_num_children)};
}

/**
Expand Down Expand Up @@ -1032,7 +1032,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view
* @return Reference to the element at the specified index
*/
template <typename T, CUDF_ENABLE_IF(is_rep_layout_compatible<T>())>
__device__ T& element(size_type element_index) const noexcept
__device__ [[nodiscard]] T& element(size_type element_index) const noexcept
{
return data<T>()[element_index];
}
Expand Down
Loading

0 comments on commit dc93356

Please sign in to comment.