From 9be4de53268d49665bc0d700f12f1192207fff79 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Thu, 9 Nov 2023 16:00:43 -0800 Subject: [PATCH 01/17] Upgrade to nvCOMP 3.0.4 (#13815) Update the nvCOMP version used for cuIO compression/decompression to 3.0.4. Authors: - Vukasin Milovanovic (https://github.com/vuule) - Bradley Dice (https://github.com/bdice) Approvers: - Bradley Dice (https://github.com/bdice) - Ray Douglass (https://github.com/raydouglass) URL: https://github.com/rapidsai/cudf/pull/13815 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +- conda/environments/all_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/libcudf/conda_build_config.yaml | 2 +- dependencies.yaml | 2 +- 4 files changed, 4 insertions(+), 4 deletions(-) diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index 4d5c56e4a7d..a479d517c24 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -60,7 +60,7 @@ dependencies: - numpy>=1.21,<1.25 - numpydoc - nvcc_linux-64=11.8 -- nvcomp==2.6.1 +- nvcomp==3.0.4 - nvtx>=0.2.1 - packaging - pandas>=1.3,<1.6.0dev0 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index 8606932dae4..d1779aaeeac 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -59,7 +59,7 @@ dependencies: - numba>=0.57,<0.58 - numpy>=1.21,<1.25 - numpydoc -- nvcomp==2.6.1 +- nvcomp==3.0.4 - nvtx>=0.2.1 - packaging - pandas>=1.3,<1.6.0dev0 diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 05b2135184b..fa06ed048b7 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -38,7 +38,7 @@ spdlog_version: - ">=1.11.0,<1.12" nvcomp_version: - - "=2.6.1" + - "=3.0.4" zlib_version: - ">=1.2.13" diff --git a/dependencies.yaml b/dependencies.yaml index 35d08239a4c..3850347aa63 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -244,7 +244,7 @@ dependencies: - libarrow-all==14.0.1.* - librdkafka>=1.9.0,<1.10.0a0 # Align nvcomp version with rapids-cmake - - nvcomp==2.6.1 + - nvcomp==3.0.4 - spdlog>=1.11.0,<1.12 build_wheels: common: From 87d2a36f04f431a8c5236d2aee723ec79b9dc5f9 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Thu, 9 Nov 2023 16:54:25 -0800 Subject: [PATCH 02/17] Remove Cython libcpp wrappers (#14382) All of these wrappers have now been upstreamed into Cython as of Cython 3.0.3. Contributes to #14023 Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - GALI PREM SAGAR (https://github.com/galipremsagar) - Bradley Dice (https://github.com/bdice) - Jake Awe (https://github.com/AyodeAwe) URL: https://github.com/rapidsai/cudf/pull/14382 --- .../all_cuda-118_arch-x86_64.yaml | 2 +- .../all_cuda-120_arch-x86_64.yaml | 2 +- conda/recipes/cudf/meta.yaml | 2 +- conda/recipes/cudf_kafka/meta.yaml | 2 +- dependencies.yaml | 2 +- python/cudf/cudf/_lib/column.pyx | 3 +- python/cudf/cudf/_lib/concat.pyx | 3 +- python/cudf/cudf/_lib/copying.pyx | 5 +- python/cudf/cudf/_lib/cpp/copying.pxd | 2 +- python/cudf/cudf/_lib/cpp/groupby.pxd | 4 +- python/cudf/cudf/_lib/cpp/io/orc.pxd | 2 +- python/cudf/cudf/_lib/cpp/io/parquet.pxd | 4 +- python/cudf/cudf/_lib/cpp/io/timezone.pxd | 2 +- python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd | 0 python/cudf/cudf/_lib/cpp/libcpp/__init__.py | 0 .../cudf/cudf/_lib/cpp/libcpp/functional.pxd | 7 --- python/cudf/cudf/_lib/cpp/libcpp/memory.pxd | 12 ----- python/cudf/cudf/_lib/cpp/libcpp/optional.pxd | 50 ------------------- python/cudf/cudf/_lib/expressions.pyx | 3 +- python/cudf/cudf/_lib/groupby.pyx | 3 +- python/cudf/cudf/_lib/join.pyx | 3 +- python/cudf/cudf/_lib/null_mask.pyx | 3 +- python/cudf/cudf/_lib/parquet.pyx | 3 +- python/cudf/cudf/_lib/timezone.pyx | 2 +- python/cudf/pyproject.toml | 2 +- python/cudf_kafka/cudf_kafka/_lib/kafka.pyx | 3 +- python/cudf_kafka/pyproject.toml | 2 +- 27 files changed, 27 insertions(+), 101 deletions(-) delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/__init__.py delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/functional.pxd delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/memory.pxd delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/optional.pxd diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index a479d517c24..adf4fcad32d 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -24,7 +24,7 @@ dependencies: - cudatoolkit - cupy>=12.0.0 - cxx-compiler -- cython>=3.0.0 +- cython>=3.0.3 - dask-core>=2023.9.2 - dask-cuda==23.12.* - dask>=2023.9.2 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index d1779aaeeac..a69ef587570 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -26,7 +26,7 @@ dependencies: - cuda-version=12.0 - cupy>=12.0.0 - cxx-compiler -- cython>=3.0.0 +- cython>=3.0.3 - dask-core>=2023.9.2 - dask-cuda==23.12.* - dask>=2023.9.2 diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml index 9b5c5f3d14b..27edde1c98a 100644 --- a/conda/recipes/cudf/meta.yaml +++ b/conda/recipes/cudf/meta.yaml @@ -57,7 +57,7 @@ requirements: host: - protobuf ==4.24.* - python - - cython >=3.0.0 + - cython >=3.0.3 - scikit-build >=0.13.1 - setuptools - dlpack >=0.5,<0.6.0a0 diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index cdc547b4d68..9440f8bf124 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -51,7 +51,7 @@ requirements: {% endif %} host: - python - - cython >=3.0.0 + - cython >=3.0.3 - cuda-version ={{ cuda_version }} - cudf ={{ version }} - libcudf_kafka ={{ version }} diff --git a/dependencies.yaml b/dependencies.yaml index 3850347aa63..a16b51f4483 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -256,7 +256,7 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - cython>=3.0.0 + - cython>=3.0.3 # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - &numpy numpy>=1.21,<1.25 - output_types: [conda, requirements, pyproject] diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx index f751d73b142..0edf9f8aa95 100644 --- a/python/cudf/cudf/_lib/column.pyx +++ b/python/cudf/cudf/_lib/column.pyx @@ -24,7 +24,7 @@ from cudf.utils.dtypes import _get_base_dtype from cpython.buffer cimport PyObject_CheckBuffer from libc.stdint cimport uintptr_t -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -47,7 +47,6 @@ from cudf._lib.cpp.column.column_factories cimport ( make_numeric_column, ) from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.null_mask cimport null_count as cpp_null_count from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.scalar cimport DeviceScalar diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx index feaf75ef237..1ec4719631e 100644 --- a/python/cudf/cudf/_lib/concat.pyx +++ b/python/cudf/cudf/_lib/concat.pyx @@ -1,7 +1,7 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.utility cimport move from libcpp.vector cimport vector @@ -12,7 +12,6 @@ from cudf._lib.cpp.concatenate cimport ( concatenate_masks as libcudf_concatenate_masks, concatenate_tables as libcudf_concatenate_tables, ) -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table cimport table, table_view from cudf._lib.utils cimport ( data_from_unique_ptr, diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx index f57bc15ed57..ea6ee76c14a 100644 --- a/python/cudf/cudf/_lib/copying.pyx +++ b/python/cudf/cudf/_lib/copying.pyx @@ -24,12 +24,13 @@ from cudf._lib.utils cimport table_view_from_columns, table_view_from_table from cudf._lib.reduce import minmax from cudf.core.abc import Serializable +from libcpp.functional cimport reference_wrapper +from libcpp.memory cimport make_unique + cimport cudf._lib.cpp.contiguous_split as cpp_contiguous_split cimport cudf._lib.cpp.copying as cpp_copying from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.lists.gather cimport ( segmented_gather as cpp_segmented_gather, ) diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd index 20725c252fc..5637b55ac1c 100644 --- a/python/cudf/cudf/_lib/cpp/copying.pxd +++ b/python/cudf/cudf/_lib/cpp/copying.pxd @@ -2,6 +2,7 @@ from libc.stdint cimport int32_t, int64_t, uint8_t from libcpp cimport bool +from libcpp.functional cimport reference_wrapper from libcpp.memory cimport unique_ptr from libcpp.vector cimport vector @@ -9,7 +10,6 @@ from rmm._lib.device_buffer cimport device_buffer from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table from cudf._lib.cpp.table.table_view cimport table_view diff --git a/python/cudf/cudf/_lib/cpp/groupby.pxd b/python/cudf/cudf/_lib/cpp/groupby.pxd index 2ecdf76842f..0266404fc50 100644 --- a/python/cudf/cudf/_lib/cpp/groupby.pxd +++ b/python/cudf/cudf/_lib/cpp/groupby.pxd @@ -1,6 +1,7 @@ -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2023, NVIDIA CORPORATION. from libcpp cimport bool +from libcpp.functional cimport reference_wrapper from libcpp.memory cimport unique_ptr from libcpp.pair cimport pair from libcpp.vector cimport vector @@ -11,7 +12,6 @@ from cudf._lib.cpp.aggregation cimport ( ) from cudf._lib.cpp.column.column cimport column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.replace cimport replace_policy from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd index dd6f919a74d..d5ac8574fe4 100644 --- a/python/cudf/cudf/_lib/cpp/io/orc.pxd +++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd @@ -4,12 +4,12 @@ from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types cimport cudf._lib.cpp.table.table_view as cudf_table_view -from cudf._lib.cpp.libcpp.optional cimport optional from cudf._lib.cpp.types cimport data_type, size_type diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd index a6a7ba034aa..cdd1bde0274 100644 --- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd +++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd @@ -2,16 +2,16 @@ from libc.stdint cimport uint8_t from libcpp cimport bool +from libcpp.functional cimport reference_wrapper from libcpp.map cimport map from libcpp.memory cimport shared_ptr, unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string from libcpp.vector cimport vector cimport cudf._lib.cpp.io.types as cudf_io_types cimport cudf._lib.cpp.table.table_view as cudf_table_view from cudf._lib.cpp.expressions cimport expression -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper -from cudf._lib.cpp.libcpp.optional cimport optional from cudf._lib.cpp.types cimport data_type, size_type diff --git a/python/cudf/cudf/_lib/cpp/io/timezone.pxd b/python/cudf/cudf/_lib/cpp/io/timezone.pxd index ba481d9a1d3..927c2118473 100644 --- a/python/cudf/cudf/_lib/cpp/io/timezone.pxd +++ b/python/cudf/cudf/_lib/cpp/io/timezone.pxd @@ -2,9 +2,9 @@ from libcpp cimport bool from libcpp.memory cimport unique_ptr +from libcpp.optional cimport optional from libcpp.string cimport string -from cudf._lib.cpp.libcpp.optional cimport optional from cudf._lib.cpp.table.table cimport table diff --git a/python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd b/python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/cpp/libcpp/__init__.py b/python/cudf/cudf/_lib/cpp/libcpp/__init__.py deleted file mode 100644 index e69de29bb2d..00000000000 diff --git a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd deleted file mode 100644 index f3e2d6d0878..00000000000 --- a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd +++ /dev/null @@ -1,7 +0,0 @@ -# Copyright (c) 2020, NVIDIA CORPORATION. - - -cdef extern from "" namespace "std" nogil: - cdef cppclass reference_wrapper[T]: - reference_wrapper() - reference_wrapper(T) diff --git a/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd b/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd deleted file mode 100644 index 2178f1a940c..00000000000 --- a/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd +++ /dev/null @@ -1,12 +0,0 @@ -# Copyright (c) 2023, NVIDIA CORPORATION. - -from libcpp.memory cimport unique_ptr - - -cdef extern from "" namespace "std" nogil: - # The Cython standard header does not have except +, so C++ - # exceptions from make_unique are not caught and translated to - # Python ones. This is not perfectly ergonomic, we always have to - # wrap make_unique in move, but at least we can catch exceptions. - # See https://github.com/cython/cython/issues/5560 - unique_ptr[T] make_unique[T](...) except + diff --git a/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd deleted file mode 100644 index a78c18f3f7a..00000000000 --- a/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd +++ /dev/null @@ -1,50 +0,0 @@ -# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION & -# AFFILIATES. All rights reserved. SPDX-License-Identifier: -# Apache-2.0 -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from libcpp cimport bool - - -cdef extern from "" namespace "std" nogil: - cdef cppclass nullopt_t: - nullopt_t() - - cdef nullopt_t nullopt - - cdef cppclass optional[T]: - ctypedef T value_type - optional() - optional(nullopt_t) - optional(optional&) except + - optional(T&) except + - bool has_value() - T& value() - T& value_or[U](U& default_value) - void swap(optional&) - void reset() - T& emplace(...) - T& operator*() - optional& operator=(optional&) - optional& operator=[U](U&) - bool operator bool() - bool operator!() - bool operator==[U](optional&, U&) - bool operator!=[U](optional&, U&) - bool operator<[U](optional&, U&) - bool operator>[U](optional&, U&) - bool operator<=[U](optional&, U&) - bool operator>=[U](optional&, U&) - - optional[T] make_optional[T](...) except + diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx index 8d7545ffe15..01a080f635f 100644 --- a/python/cudf/cudf/_lib/expressions.pyx +++ b/python/cudf/cudf/_lib/expressions.pyx @@ -4,12 +4,11 @@ from enum import Enum from cython.operator cimport dereference from libc.stdint cimport int64_t -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from cudf._lib.cpp cimport expressions as libcudf_exp -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.types cimport size_type # Necessary for proper casting, see below. diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx index a26d820de6f..b3778e45cde 100644 --- a/python/cudf/cudf/_lib/groupby.pyx +++ b/python/cudf/cudf/_lib/groupby.pyx @@ -24,6 +24,8 @@ from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns from cudf._lib.scalar import as_device_scalar +from libcpp.functional cimport reference_wrapper + cimport cudf._lib.cpp.groupby as libcudf_groupby cimport cudf._lib.cpp.types as libcudf_types from cudf._lib.aggregation cimport ( @@ -33,7 +35,6 @@ from cudf._lib.aggregation cimport ( make_groupby_scan_aggregation, ) from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.libcpp.functional cimport reference_wrapper from cudf._lib.cpp.replace cimport replace_policy from cudf._lib.cpp.scalar.scalar cimport scalar from cudf._lib.cpp.table.table cimport table, table_view diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx index 416680aae24..378be978cc0 100644 --- a/python/cudf/cudf/_lib/join.pyx +++ b/python/cudf/cudf/_lib/join.pyx @@ -2,7 +2,7 @@ from cudf.core.buffer import acquire_spill_lock -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move @@ -11,7 +11,6 @@ from rmm._lib.device_buffer cimport device_buffer cimport cudf._lib.cpp.join as cpp_join from cudf._lib.column cimport Column from cudf._lib.cpp.column.column cimport column -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type, type_id from cudf._lib.utils cimport table_view_from_columns diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx index 5b4538629f6..1f98140d9e4 100644 --- a/python/cudf/cudf/_lib/null_mask.pyx +++ b/python/cudf/cudf/_lib/null_mask.pyx @@ -6,13 +6,12 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer from cudf.core.buffer import acquire_spill_lock, as_buffer -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.pair cimport pair from libcpp.utility cimport move from cudf._lib.column cimport Column from cudf._lib.cpp.column.column_view cimport column_view -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.null_mask cimport ( bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes, bitmask_and as cpp_bitmask_and, diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx index d8d363686cc..4acb1ce10b1 100644 --- a/python/cudf/cudf/_lib/parquet.pyx +++ b/python/cudf/cudf/_lib/parquet.pyx @@ -32,7 +32,7 @@ from cudf._lib.utils import _index_level_name, generate_pandas_metadata from libc.stdint cimport uint8_t from libcpp cimport bool from libcpp.map cimport map -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.unordered_map cimport unordered_map from libcpp.utility cimport move @@ -52,7 +52,6 @@ from cudf._lib.cpp.io.parquet cimport ( write_parquet as parquet_writer, ) from cudf._lib.cpp.io.types cimport column_in_metadata, table_input_metadata -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf._lib.cpp.table.table_view cimport table_view from cudf._lib.cpp.types cimport data_type, size_type from cudf._lib.io.datasource cimport NativeFileDatasource diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx index 4d76cbfcdb5..808d1321b0b 100644 --- a/python/cudf/cudf/_lib/timezone.pyx +++ b/python/cudf/cudf/_lib/timezone.pyx @@ -1,13 +1,13 @@ # Copyright (c) 2023, NVIDIA CORPORATION. from libcpp.memory cimport unique_ptr +from libcpp.optional cimport make_optional from libcpp.string cimport string from libcpp.utility cimport move from cudf._lib.cpp.io.timezone cimport ( make_timezone_transition_table as cpp_make_timezone_transition_table, ) -from cudf._lib.cpp.libcpp.optional cimport make_optional from cudf._lib.cpp.table.table cimport table from cudf._lib.utils cimport columns_from_unique_ptr diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml index 1c687269e55..b38970271d7 100644 --- a/python/cudf/pyproject.toml +++ b/python/cudf/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" requires = [ "cmake>=3.26.4", - "cython>=3.0.0", + "cython>=3.0.3", "ninja", "numpy>=1.21,<1.25", "protoc-wheel", diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx index 4d732478723..2fbaacff7c6 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx @@ -3,12 +3,11 @@ from libc.stdint cimport int32_t, int64_t from libcpp cimport bool, nullptr from libcpp.map cimport map -from libcpp.memory cimport unique_ptr +from libcpp.memory cimport make_unique, unique_ptr from libcpp.string cimport string from libcpp.utility cimport move from cudf._lib.cpp.io.datasource cimport datasource -from cudf._lib.cpp.libcpp.memory cimport make_unique from cudf_kafka._lib.kafka cimport kafka_consumer diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index f5cbd480e9c..4829f06ab09 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -3,7 +3,7 @@ [build-system] requires = [ - "cython>=3.0.0", + "cython>=3.0.3", "numpy>=1.21,<1.25", "pyarrow==14.0.1.*", "setuptools", From 04d13d81b0bb4c2b3db2bfc9d9e28432e0a73c44 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Mon, 13 Nov 2023 09:05:19 -0500 Subject: [PATCH 03/17] Normalizing offsets iterator (#14234) Creates a normalizing offsets iterator that returns an int64 value given either a int32 or int64 column data. Depends on #14206 Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Divye Gala (https://github.com/divyegala) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/14234 --- .../cudf/column/column_device_view.cuh | 8 +- cpp/include/cudf/detail/indexalator.cuh | 151 ++++++++++++++-- .../cudf/detail/normalizing_iterator.cuh | 160 +---------------- cpp/include/cudf/detail/offsets_iterator.cuh | 165 ++++++++++++++++++ .../cudf/detail/offsets_iterator_factory.cuh | 47 +++++ cpp/tests/CMakeLists.txt | 1 + cpp/tests/iterator/indexalator_test.cu | 37 ---- cpp/tests/iterator/offsetalator_test.cu | 140 +++++++++++++++ 8 files changed, 502 insertions(+), 207 deletions(-) create mode 100644 cpp/include/cudf/detail/offsets_iterator.cuh create mode 100644 cpp/include/cudf/detail/offsets_iterator_factory.cuh create mode 100644 cpp/tests/iterator/offsetalator_test.cu diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 35851a99822..b1ff0bbaea7 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -16,6 +16,7 @@ #pragma once #include +#include #include #include #include @@ -442,10 +443,11 @@ class alignas(16) column_device_view : public detail::column_device_view_base { __device__ T element(size_type element_index) const noexcept { size_type index = element_index + offset(); // account for this view's _offset - auto const* d_offsets = d_children[strings_column_view::offsets_column_index].data(); char const* d_strings = d_children[strings_column_view::chars_column_index].data(); - size_type offset = d_offsets[index]; - return string_view{d_strings + offset, d_offsets[index + 1] - offset}; + auto const offsets = d_children[strings_column_view::offsets_column_index]; + auto const itr = cudf::detail::input_offsetalator(offsets.head(), offsets.type()); + auto const offset = itr[index]; + return string_view{d_strings + offset, static_cast(itr[index + 1] - offset)}; } private: diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh index 6532dae3695..4d261c54b29 100644 --- a/cpp/include/cudf/detail/indexalator.cuh +++ b/cpp/include/cudf/detail/indexalator.cuh @@ -56,10 +56,69 @@ namespace detail { * auto result = thrust::find(thrust::device, begin, end, size_type{12} ); * @endcode */ -using input_indexalator = input_normalator; +struct input_indexalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = cudf::size_type const; // this keeps STL and thrust happy + + input_indexalator() = default; + input_indexalator(input_indexalator const&) = default; + input_indexalator(input_indexalator&&) = default; + input_indexalator& operator=(input_indexalator const&) = default; + input_indexalator& operator=(input_indexalator&&) = default; + + /** + * @brief Indirection operator returns the value at the current iterator position + */ + __device__ inline cudf::size_type operator*() const { return operator[](0); } + + /** + * @brief Dispatch functor for resolving a Integer value from any integer type + */ + struct normalize_type { + template ())> + __device__ cudf::size_type operator()(void const* tp) + { + return static_cast(*static_cast(tp)); + } + template ())> + __device__ cudf::size_type operator()(void const*) + { + CUDF_UNREACHABLE("only integral types are supported"); + } + }; + + /** + * @brief Array subscript operator returns a value at the input + * `idx` position as a `Integer` value. + */ + __device__ inline cudf::size_type operator[](size_type idx) const + { + void const* tp = p_ + (idx * this->width_); + return type_dispatcher(this->dtype_, normalize_type{}, tp); + } + + /** + * @brief Create an input index normalizing iterator + * + * Use the indexalator_factory to create an iterator instance. + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + * @param offset Applied to the data pointer per size of the type + */ + CUDF_HOST_DEVICE input_indexalator(void const* data, data_type dtype, cudf::size_type offset = 0) + : base_normalator(dtype), p_{static_cast(data)} + { + p_ += offset * this->width_; + } + + protected: + char const* p_; /// pointer to the integer data in device memory +}; /** - * @brief The index normalizing output iterator. + * @brief The index normalizing output iterator * * This is an iterator that can be used for index types (integers) without * requiring a type-specific instance. It can be used for any iterator @@ -82,7 +141,75 @@ using input_indexalator = input_normalator; * thrust::less()); * @endcode */ -using output_indexalator = output_normalator; +struct output_indexalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = output_indexalator const&; // required for output iterators + + output_indexalator() = default; + output_indexalator(output_indexalator const&) = default; + output_indexalator(output_indexalator&&) = default; + output_indexalator& operator=(output_indexalator const&) = default; + output_indexalator& operator=(output_indexalator&&) = default; + + /** + * @brief Indirection operator returns this iterator instance in order + * to capture the `operator=(Integer)` calls. + */ + __device__ inline reference operator*() const { return *this; } + + /** + * @brief Array subscript operator returns an iterator instance at the specified `idx` position. + * + * This allows capturing the subsequent `operator=(Integer)` call in this class. + */ + __device__ inline output_indexalator const operator[](size_type idx) const + { + output_indexalator tmp{*this}; + tmp.p_ += (idx * this->width_); + return tmp; + } + + /** + * @brief Dispatch functor for setting the index value from a size_type value. + */ + struct normalize_type { + template ())> + __device__ void operator()(void* tp, cudf::size_type const value) + { + (*static_cast(tp)) = static_cast(value); + } + template ())> + __device__ void operator()(void*, cudf::size_type const) + { + CUDF_UNREACHABLE("only index types are supported"); + } + }; + + /** + * @brief Assign an Integer value to the current iterator position + */ + __device__ inline reference operator=(cudf::size_type const value) const + { + void* tp = p_; + type_dispatcher(this->dtype_, normalize_type{}, tp, value); + return *this; + } + + /** + * @brief Create an output normalizing iterator + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + */ + CUDF_HOST_DEVICE output_indexalator(void* data, data_type dtype) + : base_normalator(dtype), p_{static_cast(data)} + { + } + + protected: + char* p_; /// pointer to the integer data in device memory +}; /** * @brief Use this class to create an indexalator instance. @@ -92,14 +219,12 @@ struct indexalator_factory { * @brief A type_dispatcher functor to create an input iterator from an indices column. */ struct input_indexalator_fn { - template ()>* = nullptr> + template ())> input_indexalator operator()(column_view const& indices) { return input_indexalator(indices.data(), indices.type()); } - template ()>* = nullptr> + template ())> input_indexalator operator()(Args&&... args) { CUDF_FAIL("indices must be an index type"); @@ -110,16 +235,14 @@ struct indexalator_factory { * @brief Use this class to create an indexalator to a scalar index. */ struct input_indexalator_scalar_fn { - template ()>* = nullptr> + template ())> input_indexalator operator()(scalar const& index) { // note: using static_cast const&>(index) creates a copy auto const scalar_impl = static_cast const*>(&index); return input_indexalator(scalar_impl->data(), index.type()); } - template ()>* = nullptr> + template ())> input_indexalator operator()(Args&&... args) { CUDF_FAIL("scalar must be an index type"); @@ -130,14 +253,12 @@ struct indexalator_factory { * @brief A type_dispatcher functor to create an output iterator from an indices column. */ struct output_indexalator_fn { - template ()>* = nullptr> + template ())> output_indexalator operator()(mutable_column_view const& indices) { return output_indexalator(indices.data(), indices.type()); } - template ()>* = nullptr> + template ())> output_indexalator operator()(Args&&... args) { CUDF_FAIL("indices must be an index type"); diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh index 35a695d47df..8f90afc3e57 100644 --- a/cpp/include/cudf/detail/normalizing_iterator.cuh +++ b/cpp/include/cudf/detail/normalizing_iterator.cuh @@ -33,7 +33,7 @@ namespace detail { * @tparam Integer The type the iterator normalizes to */ template -struct base_normalator { +struct alignas(16) base_normalator { static_assert(cudf::is_index_type()); using difference_type = std::ptrdiff_t; using value_type = Integer; @@ -204,7 +204,7 @@ struct base_normalator { private: struct integer_sizeof_fn { - template ()>* = nullptr> + template ())> CUDF_HOST_DEVICE constexpr std::size_t operator()() const { #ifndef __CUDA_ARCH__ @@ -213,7 +213,7 @@ struct base_normalator { CUDF_UNREACHABLE("only integral types are supported"); #endif } - template ()>* = nullptr> + template ())> CUDF_HOST_DEVICE constexpr std::size_t operator()() const noexcept { return sizeof(T); @@ -229,160 +229,16 @@ struct base_normalator { width_ = static_cast(type_dispatcher(dtype, integer_sizeof_fn{})); } - int32_t width_; /// integer type width = 1,2,4, or 8 - data_type dtype_; /// for type-dispatcher calls -}; - -/** - * @brief The integer normalizing input iterator - * - * This is an iterator that can be used for index types (integers) without - * requiring a type-specific instance. It can be used for any iterator - * interface for reading an array of integer values of type - * int8, int16, int32, int64, uint8, uint16, uint32, or uint64. - * Reading specific elements always return a type of `Integer` - * - * @tparam Integer Type returned by all read functions - */ -template -struct input_normalator : base_normalator, Integer> { - friend struct base_normalator, Integer>; // for CRTP - - using reference = Integer const; // this keeps STL and thrust happy - - input_normalator() = default; - input_normalator(input_normalator const&) = default; - input_normalator(input_normalator&&) = default; - input_normalator& operator=(input_normalator const&) = default; - input_normalator& operator=(input_normalator&&) = default; - - /** - * @brief Indirection operator returns the value at the current iterator position - */ - __device__ inline Integer operator*() const { return operator[](0); } - - /** - * @brief Dispatch functor for resolving a Integer value from any integer type - */ - struct normalize_type { - template ()>* = nullptr> - __device__ Integer operator()(void const* tp) - { - return static_cast(*static_cast(tp)); - } - template ()>* = nullptr> - __device__ Integer operator()(void const*) - { - CUDF_UNREACHABLE("only integral types are supported"); - } - }; - /** - * @brief Array subscript operator returns a value at the input - * `idx` position as a `Integer` value. - */ - __device__ inline Integer operator[](size_type idx) const - { - void const* tp = p_ + (idx * this->width_); - return type_dispatcher(this->dtype_, normalize_type{}, tp); - } - - /** - * @brief Create an input index normalizing iterator. - * - * Use the indexalator_factory to create an iterator instance. - * - * @param data Pointer to an integer array in device memory. - * @param data_type Type of data in data - */ - CUDF_HOST_DEVICE input_normalator(void const* data, data_type dtype, cudf::size_type offset = 0) - : base_normalator, Integer>(dtype), p_{static_cast(data)} - { - p_ += offset * this->width_; - } - - char const* p_; /// pointer to the integer data in device memory -}; - -/** - * @brief The integer normalizing output iterator - * - * This is an iterator that can be used for index types (integers) without - * requiring a type-specific instance. It can be used for any iterator - * interface for writing an array of integer values of type - * int8, int16, int32, int64, uint8, uint16, uint32, or uint64. - * Setting specific elements always accept the `Integer` type values. - * - * @tparam Integer The type used for all write functions - */ -template -struct output_normalator : base_normalator, Integer> { - friend struct base_normalator, Integer>; // for CRTP - - using reference = output_normalator const&; // required for output iterators - - output_normalator() = default; - output_normalator(output_normalator const&) = default; - output_normalator(output_normalator&&) = default; - output_normalator& operator=(output_normalator const&) = default; - output_normalator& operator=(output_normalator&&) = default; - - /** - * @brief Indirection operator returns this iterator instance in order - * to capture the `operator=(Integer)` calls. - */ - __device__ inline output_normalator const& operator*() const { return *this; } - - /** - * @brief Array subscript operator returns an iterator instance at the specified `idx` position. - * - * This allows capturing the subsequent `operator=(Integer)` call in this class. - */ - __device__ inline output_normalator const operator[](size_type idx) const - { - output_normalator tmp{*this}; - tmp.p_ += (idx * this->width_); - return tmp; - } - - /** - * @brief Dispatch functor for setting the index value from a size_type value. - */ - struct normalize_type { - template ()>* = nullptr> - __device__ void operator()(void* tp, Integer const value) - { - (*static_cast(tp)) = static_cast(value); - } - template ()>* = nullptr> - __device__ void operator()(void*, Integer const) - { - CUDF_UNREACHABLE("only index types are supported"); - } - }; - - /** - * @brief Assign an Integer value to the current iterator position - */ - __device__ inline output_normalator const& operator=(Integer const value) const - { - void* tp = p_; - type_dispatcher(this->dtype_, normalize_type{}, tp, value); - return *this; - } - - /** - * @brief Create an output normalizing iterator - * - * @param data Pointer to an integer array in device memory. - * @param data_type Type of data in data + * @brief Constructor assigns width and type member variables for base class. */ - CUDF_HOST_DEVICE output_normalator(void* data, data_type dtype) - : base_normalator, Integer>(dtype), p_{static_cast(data)} + explicit CUDF_HOST_DEVICE base_normalator(data_type dtype, int32_t width) + : width_(width), dtype_(dtype) { } - char* p_; /// pointer to the integer data in device memory + int32_t width_; /// integer type width = 1,2,4, or 8 + data_type dtype_; /// for type-dispatcher calls }; } // namespace detail diff --git a/cpp/include/cudf/detail/offsets_iterator.cuh b/cpp/include/cudf/detail/offsets_iterator.cuh new file mode 100644 index 00000000000..3eb77b32353 --- /dev/null +++ b/cpp/include/cudf/detail/offsets_iterator.cuh @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cudf { +namespace detail { + +/** + * @brief The offsets normalizing input iterator + * + * This is an iterator that can be used for offsets where the underlying + * type may be int32_t or int64_t. + * + * Use the offsetalator_factory to create an appropriate input iterator + * from an offsets column_view. + */ +struct input_offsetalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = int64_t const; // this keeps STL and thrust happy + + input_offsetalator() = default; + input_offsetalator(input_offsetalator const&) = default; + input_offsetalator(input_offsetalator&&) = default; + input_offsetalator& operator=(input_offsetalator const&) = default; + input_offsetalator& operator=(input_offsetalator&&) = default; + + /** + * @brief Indirection operator returns the value at the current iterator position + */ + __device__ inline int64_t operator*() const { return operator[](0); } + + /** + * @brief Array subscript operator returns a value at the input + * `idx` position as a int64_t value. + */ + __device__ inline int64_t operator[](size_type idx) const + { + void const* tp = p_ + (idx * this->width_); + return this->width_ == sizeof(int32_t) ? static_cast(*static_cast(tp)) + : *static_cast(tp); + } + + /** + * @brief Create an input index normalizing iterator. + * + * Use the indexalator_factory to create an iterator instance. + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + */ + CUDF_HOST_DEVICE input_offsetalator(void const* data, data_type dtype) + : base_normalator( + dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)), + p_{static_cast(data)} + { +#ifndef __CUDA_ARCH__ + CUDF_EXPECTS(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64, + "Unexpected offsets type"); +#else + cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) && + "Unexpected offsets type"); +#endif + } + + protected: + char const* p_; /// pointer to the integer data in device memory +}; + +/** + * @brief The offsets normalizing output iterator + * + * This is an iterator that can be used for storing offsets values + * where the underlying type may be either int32_t or int64_t. + * + * Use the offsetalator_factory to create an appropriate output iterator + * from a mutable_column_view. + * + */ +struct output_offsetalator : base_normalator { + friend struct base_normalator; // for CRTP + + using reference = output_offsetalator const&; // required for output iterators + + output_offsetalator() = default; + output_offsetalator(output_offsetalator const&) = default; + output_offsetalator(output_offsetalator&&) = default; + output_offsetalator& operator=(output_offsetalator const&) = default; + output_offsetalator& operator=(output_offsetalator&&) = default; + + /** + * @brief Indirection operator returns this iterator instance in order + * to capture the `operator=(int64)` calls. + */ + __device__ inline output_offsetalator const& operator*() const { return *this; } + + /** + * @brief Array subscript operator returns an iterator instance at the specified `idx` position. + * + * This allows capturing the subsequent `operator=(int64)` call in this class. + */ + __device__ inline output_offsetalator const operator[](size_type idx) const + { + output_offsetalator tmp{*this}; + tmp.p_ += (idx * this->width_); + return tmp; + } + + /** + * @brief Assign an offset value to the current iterator position + */ + __device__ inline output_offsetalator const& operator=(int64_t const value) const + { + void* tp = p_; + if (this->width_ == sizeof(int32_t)) { + (*static_cast(tp)) = static_cast(value); + } else { + (*static_cast(tp)) = value; + } + return *this; + } + + /** + * @brief Create an output offsets iterator + * + * @param data Pointer to an integer array in device memory. + * @param dtype Type of data in data + */ + CUDF_HOST_DEVICE output_offsetalator(void* data, data_type dtype) + : base_normalator( + dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)), + p_{static_cast(data)} + { +#ifndef __CUDA_ARCH__ + CUDF_EXPECTS(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64, + "Unexpected offsets type"); +#else + cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) && + "Unexpected offsets type"); +#endif + } + + protected: + char* p_; /// pointer to the integer data in device memory +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/include/cudf/detail/offsets_iterator_factory.cuh b/cpp/include/cudf/detail/offsets_iterator_factory.cuh new file mode 100644 index 00000000000..5b4c6b825d2 --- /dev/null +++ b/cpp/include/cudf/detail/offsets_iterator_factory.cuh @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include + +namespace cudf { +namespace detail { + +/** + * @brief Use this class to create an offsetalator instance. + */ +struct offsetalator_factory { + /** + * @brief Create an input offsetalator instance from an offsets column + */ + static input_offsetalator make_input_iterator(column_view const& offsets) + { + return input_offsetalator(offsets.head(), offsets.type()); + } + + /** + * @brief Create an output offsetalator instance from an offsets column + */ + static output_offsetalator make_output_iterator(mutable_column_view const& offsets) + { + return output_offsetalator(offsets.head(), offsets.type()); + } +}; + +} // namespace detail +} // namespace cudf diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index b0382d15807..7b628649051 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -393,6 +393,7 @@ set_tests_properties( ConfigureTest( ITERATOR_TEST iterator/indexalator_test.cu + iterator/offsetalator_test.cu iterator/optional_iterator_test_chrono.cu iterator/optional_iterator_test_numeric.cu iterator/pair_iterator_test_chrono.cu diff --git a/cpp/tests/iterator/indexalator_test.cu b/cpp/tests/iterator/indexalator_test.cu index 3e8bcd5cb0d..0c10853ec02 100644 --- a/cpp/tests/iterator/indexalator_test.cu +++ b/cpp/tests/iterator/indexalator_test.cu @@ -157,40 +157,3 @@ TYPED_TEST(IndexalatorTest, output_iterator) expected = cudf::test::fixed_width_column_wrapper({0, 1, 1, 2, 3, 4, 5, 5, 7}); CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); } - -/** - * For testing creating and using the indexalator in device code. - */ -struct device_functor_fn { - cudf::column_device_view const d_col; - __device__ cudf::size_type operator()(cudf::size_type idx) - { - auto itr = cudf::detail::input_indexalator(d_col.head(), d_col.type()); - return itr[idx] * 3; - } -}; - -TYPED_TEST(IndexalatorTest, device_indexalator) -{ - using T = TypeParam; - - auto d_col1 = - cudf::test::fixed_width_column_wrapper({0, 6, 7, 14, 23, 33, 43, 45, 63}); - auto d_col2 = - cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0}); - auto input = cudf::column_view(d_col1); - auto output = cudf::mutable_column_view(d_col2); - auto stream = cudf::get_default_stream(); - - auto d_input = cudf::column_device_view::create(input, stream); - - thrust::transform(rmm::exec_policy(stream), - thrust::counting_iterator(0), - thrust::counting_iterator(input.size()), - output.begin(), - device_functor_fn{*d_input}); - - auto expected = - cudf::test::fixed_width_column_wrapper({0, 18, 21, 42, 69, 99, 129, 135, 189}); - CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); -} diff --git a/cpp/tests/iterator/offsetalator_test.cu b/cpp/tests/iterator/offsetalator_test.cu new file mode 100644 index 00000000000..e569e58f42a --- /dev/null +++ b/cpp/tests/iterator/offsetalator_test.cu @@ -0,0 +1,140 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software distributed under the License + * is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express + * or implied. See the License for the specific language governing permissions and limitations under + * the License. + */ + +#include + +#include +#include + +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +using TestingTypes = cudf::test::Types; + +template +struct OffsetalatorTest : public IteratorTest {}; + +TYPED_TEST_SUITE(OffsetalatorTest, TestingTypes); + +TYPED_TEST(OffsetalatorTest, input_iterator) +{ + using T = TypeParam; + + auto host_values = cudf::test::make_type_param_vector({0, 6, 0, -14, 13, 64, -13, -20, 45}); + + auto d_col = cudf::test::fixed_width_column_wrapper(host_values.begin(), host_values.end()); + + auto expected_values = thrust::host_vector(host_values.size()); + std::transform(host_values.begin(), host_values.end(), expected_values.begin(), [](auto v) { + return static_cast(v); + }); + + auto it_dev = cudf::detail::offsetalator_factory::make_input_iterator(d_col); + this->iterator_test_thrust(expected_values, it_dev, host_values.size()); +} + +TYPED_TEST(OffsetalatorTest, output_iterator) +{ + using T = TypeParam; + + auto d_col1 = cudf::test::fixed_width_column_wrapper({0, 6, 7, 14, 23, 33, 43, 45, 63}); + auto d_col2 = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0}); + auto itr = cudf::detail::offsetalator_factory::make_output_iterator(d_col2); + auto input = cudf::column_view(d_col1); + auto stream = cudf::get_default_stream(); + + auto map = cudf::test::fixed_width_column_wrapper({0, 2, 4, 6, 8, 1, 3, 5, 7}); + auto d_map = cudf::column_view(map); + thrust::gather(rmm::exec_policy_nosync(stream), + d_map.begin(), + d_map.end(), + input.begin(), + itr); + auto expected = cudf::test::fixed_width_column_wrapper({0, 7, 23, 43, 63, 6, 14, 33, 45}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::scatter(rmm::exec_policy_nosync(stream), + input.begin(), + input.end(), + d_map.begin(), + itr); + expected = cudf::test::fixed_width_column_wrapper({0, 33, 6, 43, 7, 45, 14, 63, 23}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::fill(rmm::exec_policy(stream), itr, itr + input.size(), 77); + expected = cudf::test::fixed_width_column_wrapper({77, 77, 77, 77, 77, 77, 77, 77, 77}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + thrust::sequence(rmm::exec_policy(stream), itr, itr + input.size()); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 2, 3, 4, 5, 6, 7, 8}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); + + auto offsets = + cudf::test::fixed_width_column_wrapper({0, 10, 20, 30, 40, 50, 60, 70, 80}); + auto d_offsets = cudf::column_view(offsets); + thrust::lower_bound(rmm::exec_policy(stream), + d_offsets.begin(), + d_offsets.end(), + input.begin(), + input.end(), + itr); + expected = cudf::test::fixed_width_column_wrapper({0, 1, 1, 2, 3, 4, 5, 5, 7}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); +} + +namespace { +/** + * For testing creating and using the offsetalator in device code. + */ +struct device_functor_fn { + cudf::column_device_view const d_col; + __device__ int32_t operator()(int idx) + { + auto const itr = cudf::detail::input_offsetalator(d_col.head(), d_col.type()); + return static_cast(itr[idx] * 3); + } +}; +} // namespace + +TYPED_TEST(OffsetalatorTest, device_offsetalator) +{ + using T = TypeParam; + + auto d_col1 = cudf::test::fixed_width_column_wrapper({0, 6, 7, 14, 23, 33, 43, 45, 63}); + auto d_col2 = cudf::test::fixed_width_column_wrapper({0, 0, 0, 0, 0, 0, 0, 0, 0}); + auto input = cudf::column_view(d_col1); + auto output = cudf::mutable_column_view(d_col2); + auto stream = cudf::get_default_stream(); + + auto d_input = cudf::column_device_view::create(input, stream); + + thrust::transform(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(input.size()), + output.begin(), + device_functor_fn{*d_input}); + + auto expected = + cudf::test::fixed_width_column_wrapper({0, 18, 21, 42, 69, 99, 129, 135, 189}); + CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected); +} From 4313cfa9b3fcff41f67b48ac8797dc015d441ecc Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Mon, 13 Nov 2023 11:40:36 -0800 Subject: [PATCH 04/17] Use new rapids-dask-dependency metapackage for managing dask versions (#14364) * Update dependency lists * Update wheel building to stop needing manual installations * Update wheel dependency with alpha spec * Rename the package * Update update-version.sh * Update conda/recipes/dask-cudf/meta.yaml Co-authored-by: GALI PREM SAGAR * Make pip/conda dependencies consistent and fix recipe * dfg * Apply suggestions from code review --------- Co-authored-by: GALI PREM SAGAR --- ci/build_wheel.sh | 2 ++ ci/release/update-version.sh | 1 + ci/test_wheel_dask_cudf.sh | 3 --- conda/environments/all_cuda-118_arch-x86_64.yaml | 4 +--- conda/environments/all_cuda-120_arch-x86_64.yaml | 4 +--- conda/recipes/dask-cudf/meta.yaml | 8 +------- dependencies.yaml | 4 +--- python/dask_cudf/pyproject.toml | 3 +-- 8 files changed, 8 insertions(+), 21 deletions(-) diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh index 08716cdb3d9..ae1d9c3fb1a 100755 --- a/ci/build_wheel.sh +++ b/ci/build_wheel.sh @@ -36,6 +36,8 @@ fi if [[ ${package_name} == "dask_cudf" ]]; then sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file} + sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file} + sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file} else sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file} # ptxcompiler and cubinlinker aren't version constrained diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 7574b4174e9..843abd3c3c1 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -80,6 +80,7 @@ DEPENDENCIES=( kvikio libkvikio librmm + rapids-dask-dependency rmm ) for DEP in "${DEPENDENCIES[@]}"; do diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh index 118bea753d0..e9162b816aa 100755 --- a/ci/test_wheel_dask_cudf.sh +++ b/ci/test_wheel_dask_cudf.sh @@ -23,9 +23,6 @@ manylinux="manylinux_${manylinux_version}" RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep python -m pip install --no-deps ./local-cudf-dep/cudf*.whl -# Always install latest dask for testing -python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.12 - # echo to expand wildcard before adding `[extra]` requires for pip python -m pip install $(echo ./dist/dask_cudf*.whl)[test] diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml index adf4fcad32d..9b85888a7b3 100644 --- a/conda/environments/all_cuda-118_arch-x86_64.yaml +++ b/conda/environments/all_cuda-118_arch-x86_64.yaml @@ -25,10 +25,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-core>=2023.9.2 - dask-cuda==23.12.* -- dask>=2023.9.2 -- distributed>=2023.9.2 - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -80,6 +77,7 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 +- rapids-dask-dependency==23.12.* - rich - rmm==23.12.* - s3fs>=2022.3.0 diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index a69ef587570..da2b4e109b3 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -27,10 +27,7 @@ dependencies: - cupy>=12.0.0 - cxx-compiler - cython>=3.0.3 -- dask-core>=2023.9.2 - dask-cuda==23.12.* -- dask>=2023.9.2 -- distributed>=2023.9.2 - dlpack>=0.5,<0.6.0a0 - doxygen=1.9.1 - fastavro>=0.22.9 @@ -78,6 +75,7 @@ dependencies: - python-snappy>=0.6.0 - python>=3.9,<3.11 - pytorch<1.12.0 +- rapids-dask-dependency==23.12.* - rich - rmm==23.12.* - s3fs>=2022.3.0 diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 9dc9f76d9f5..16638926492 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -37,17 +37,11 @@ build: requirements: host: - python - - cudf ={{ version }} - - dask >=2023.9.2 - - dask-core >=2023.9.2 - - distributed >=2023.9.2 - cuda-version ={{ cuda_version }} run: - python - cudf ={{ version }} - - dask >=2023.9.2 - - dask-core >=2023.9.2 - - distributed >=2023.9.2 + - rapids-dask-dependency ={{ minor_version }} - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} test: diff --git a/dependencies.yaml b/dependencies.yaml index a16b51f4483..b971a682571 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -500,12 +500,10 @@ dependencies: common: - output_types: [conda, requirements, pyproject] packages: - - dask>=2023.9.2 - - distributed>=2023.9.2 + - rapids-dask-dependency==23.12.* - output_types: conda packages: - cupy>=12.0.0 - - dask-core>=2023.9.2 # dask-core in conda is the actual package & dask is the meta package - output_types: pyproject packages: - &cudf cudf==23.12.* diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml index 32c7bb9fd15..0306da3de46 100644 --- a/python/dask_cudf/pyproject.toml +++ b/python/dask_cudf/pyproject.toml @@ -20,11 +20,10 @@ requires-python = ">=3.9" dependencies = [ "cudf==23.12.*", "cupy-cuda11x>=12.0.0", - "dask>=2023.9.2", - "distributed>=2023.9.2", "fsspec>=0.6.0", "numpy>=1.21,<1.25", "pandas>=1.3,<1.6.0dev0", + "rapids-dask-dependency==23.12.*", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. classifiers = [ "Intended Audience :: Developers", From 5d09d38bc8ea44e1bdf1fa29e11a820c7417bac5 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 14 Nov 2023 00:51:42 -0500 Subject: [PATCH 05/17] Always build nvbench statically so we don't need to package it (#14399) Corrects failures seen in C++ CI where libnvbench.so can't be found Authors: - Robert Maynard (https://github.com/robertmaynard) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14399 --- cpp/cmake/thirdparty/get_nvbench.cmake | 2 +- cpp/cmake/thirdparty/patches/nvbench_override.json | 5 ----- 2 files changed, 1 insertion(+), 6 deletions(-) diff --git a/cpp/cmake/thirdparty/get_nvbench.cmake b/cpp/cmake/thirdparty/get_nvbench.cmake index f0642145fa0..bbd22693ba4 100644 --- a/cpp/cmake/thirdparty/get_nvbench.cmake +++ b/cpp/cmake/thirdparty/get_nvbench.cmake @@ -21,7 +21,7 @@ function(find_and_configure_nvbench) set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") rapids_cpm_package_override("${cudf_patch_dir}/nvbench_override.json") - rapids_cpm_nvbench() + rapids_cpm_nvbench(BUILD_STATIC) endfunction() diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json index 7be868081b6..ad9b19c29c1 100644 --- a/cpp/cmake/thirdparty/patches/nvbench_override.json +++ b/cpp/cmake/thirdparty/patches/nvbench_override.json @@ -7,11 +7,6 @@ "file" : "${current_json_dir}/nvbench_global_setup.diff", "issue" : "Fix add support for global setup to initialize RMM in nvbench [https://github.com/NVIDIA/nvbench/pull/123]", "fixed_in" : "" - }, - { - "file" : "nvbench/use_existing_fmt.diff", - "issue" : "Fix add support for using an existing fmt [https://github.com/NVIDIA/nvbench/pull/125]", - "fixed_in" : "" } ] } From e982d3736f095e680298af85bde732d9b5a73122 Mon Sep 17 00:00:00 2001 From: Ashwin Srinath <3190405+shwina@users.noreply.github.com> Date: Tue, 14 Nov 2023 09:51:02 -0500 Subject: [PATCH 06/17] cudf.pandas: cuDF subpath checking in module `__getattr__` (#14388) Closes https://github.com/rapidsai/cudf/issues/14384. `x.startswith(y)` is not a good enough check for if `x` is a subdirectory of `y`. It causes `pandasai` to be reported as a sub-package of `pandas`. Authors: - Ashwin Srinath (https://github.com/shwina) Approvers: - https://github.com/brandon-b-miller URL: https://github.com/rapidsai/cudf/pull/14388 --- python/cudf/cudf/pandas/module_accelerator.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py index eb35c4adaaf..180d75d96e8 100644 --- a/python/cudf/cudf/pandas/module_accelerator.py +++ b/python/cudf/cudf/pandas/module_accelerator.py @@ -10,6 +10,7 @@ import importlib.abc import importlib.machinery import os +import pathlib import sys import threading import warnings @@ -554,9 +555,10 @@ def getattr_real_or_wrapped( frame = sys._getframe() # We cannot possibly be at the top level. assert frame.f_back - calling_module = frame.f_back.f_code.co_filename + calling_module = pathlib.PurePath(frame.f_back.f_code.co_filename) use_real = any( - calling_module.startswith(path) for path in loader._denylist + calling_module.is_relative_to(path) + for path in loader._denylist ) try: if use_real: From 7f3fba164c4dd28c701ea2941d0525fc782a639c Mon Sep 17 00:00:00 2001 From: Jeremy Dyer Date: Tue, 14 Nov 2023 12:02:10 -0500 Subject: [PATCH 07/17] Refactor cudf_kafka to use skbuild (#14292) Refactor the currently outdated cudf_kafka build setup to use skbuild instead. Authors: - Jeremy Dyer (https://github.com/jdye64) - Bradley Dice (https://github.com/bdice) - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Bradley Dice (https://github.com/bdice) - AJ Schmidt (https://github.com/ajschmidt8) URL: https://github.com/rapidsai/cudf/pull/14292 --- build.sh | 2 +- ci/release/update-version.sh | 1 + .../all_cuda-120_arch-x86_64.yaml | 1 - conda/recipes/cudf_kafka/build.sh | 13 --- .../cudf_kafka/conda_build_config.yaml | 6 ++ conda/recipes/cudf_kafka/meta.yaml | 21 ++-- cpp/libcudf_kafka/CMakeLists.txt | 8 +- .../cmake/thirdparty/get_cudf.cmake | 16 +-- cpp/libcudf_kafka/tests/CMakeLists.txt | 2 +- dependencies.yaml | 13 +-- python/cudf/cudf/_lib/CMakeLists.txt | 6 -- python/cudf_kafka/CMakeLists.txt | 47 +++++++++ python/cudf_kafka/LICENSE | 1 + python/cudf_kafka/README.md | 1 + .../cudf_kafka/cudf_kafka/_lib/CMakeLists.txt | 62 ++++++++++++ python/cudf_kafka/cudf_kafka/_lib/kafka.pxd | 4 +- python/cudf_kafka/pyproject.toml | 1 + python/cudf_kafka/setup.py | 97 ++----------------- 18 files changed, 160 insertions(+), 142 deletions(-) create mode 100644 python/cudf_kafka/CMakeLists.txt create mode 120000 python/cudf_kafka/LICENSE create mode 120000 python/cudf_kafka/README.md create mode 100644 python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt diff --git a/build.sh b/build.sh index 2ad69712e5d..e5beb51dedf 100755 --- a/build.sh +++ b/build.sh @@ -369,7 +369,7 @@ fi # build cudf_kafka Python package if hasArg cudf_kafka; then cd ${REPODIR}/python/cudf_kafka - SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR}" \ + SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS}" \ SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL:-1}" \ python -m pip install --no-build-isolation --no-deps . fi diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 843abd3c3c1..4f1cbc47d1d 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -43,6 +43,7 @@ sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' # Python CMakeLists updates sed_runner 's/'"cudf_version .*)"'/'"cudf_version ${NEXT_FULL_TAG})"'/g' python/cudf/CMakeLists.txt +sed_runner 's/'"cudf_kafka_version .*)"'/'"cudf_kafka_version ${NEXT_FULL_TAG})"'/g' python/cudf_kafka/CMakeLists.txt # cpp libcudf_kafka update sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/libcudf_kafka/CMakeLists.txt diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml index da2b4e109b3..a3eeb3dd99f 100644 --- a/conda/environments/all_cuda-120_arch-x86_64.yaml +++ b/conda/environments/all_cuda-120_arch-x86_64.yaml @@ -17,7 +17,6 @@ dependencies: - cmake>=3.26.4 - cramjam - cuda-cudart-dev -- cuda-gdb - cuda-nvcc - cuda-nvrtc-dev - cuda-nvtx-dev diff --git a/conda/recipes/cudf_kafka/build.sh b/conda/recipes/cudf_kafka/build.sh index f4bb6e1bc91..9458349d101 100644 --- a/conda/recipes/cudf_kafka/build.sh +++ b/conda/recipes/cudf_kafka/build.sh @@ -1,16 +1,3 @@ # Copyright (c) 2020-2023, NVIDIA CORPORATION. -# This assumes the script is executed from the root of the repo directory -# Need to set CUDA_HOME inside conda environments because the hacked together -# setup.py for cudf-kafka searches that way. -# TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates -# cudf_kafka to use scikit-build -CUDA_MAJOR=${RAPIDS_CUDA_VERSION%%.*} -if [[ ${CUDA_MAJOR} == "12" ]]; then - target_name="x86_64-linux" - if [[ ! $(arch) == "x86_64" ]]; then - target_name="sbsa-linux" - fi - export CUDA_HOME="${PREFIX}/targets/${target_name}/" -fi ./build.sh -v cudf_kafka diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml index b63a136ad2d..c98c2701653 100644 --- a/conda/recipes/cudf_kafka/conda_build_config.yaml +++ b/conda/recipes/cudf_kafka/conda_build_config.yaml @@ -9,3 +9,9 @@ sysroot_version: cmake_version: - ">=3.26.4" + +cuda_compiler: + - cuda-nvcc + +cuda11_compiler: + - nvcc diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index 9440f8bf124..343ec2519f1 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -33,28 +33,31 @@ build: - SCCACHE_S3_KEY_PREFIX=cudf-kafka-linux64 # [linux64] - SCCACHE_S3_USE_SSL - SCCACHE_S3_NO_CREDENTIALS - # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates - # cudf_kafka to use scikit-build - - RAPIDS_CUDA_VERSION + ignore_run_exports_from: + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} + {% endif %} requirements: build: - cmake {{ cmake_version }} + - ninja - {{ compiler('c') }} - {{ compiler('cxx') }} - - ninja - - sysroot_{{ target_platform }} {{ sysroot_version }} - # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates - # cudf_kafka to use scikit-build - {% if cuda_major == "12" %} - - cuda-gdb + {% if cuda_major == "11" %} + - {{ compiler('cuda11') }} ={{ cuda_version }} + {% else %} + - {{ compiler('cuda') }} {% endif %} + - cuda-version ={{ cuda_version }} + - sysroot_{{ target_platform }} {{ sysroot_version }} host: - python - cython >=3.0.3 - cuda-version ={{ cuda_version }} - cudf ={{ version }} - libcudf_kafka ={{ version }} + - scikit-build >=0.13.1 - setuptools {% if cuda_major == "12" %} - cuda-cudart-dev diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 1a15a3ec2cd..4128afa3935 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -21,7 +21,7 @@ include(rapids-export) include(rapids-find) project( - CUDA_KAFKA + CUDF_KAFKA VERSION 23.12.00 LANGUAGES CXX ) @@ -64,7 +64,7 @@ add_library(cudf_kafka SHARED src/kafka_consumer.cpp src/kafka_callback.cpp) # ################################################################################################## # * include paths --------------------------------------------------------------------------------- target_include_directories( - cudf_kafka PUBLIC "$" + cudf_kafka PUBLIC "$" "$" ) @@ -85,6 +85,8 @@ set_target_properties( CXX_STANDARD_REQUIRED ON ) +add_library(cudf_kafka::cudf_kafka ALIAS cudf_kafka) + # ################################################################################################## # * cudf_kafka Install ---------------------------------------------------------------------------- rapids_cmake_install_lib_dir(lib_dir) @@ -94,7 +96,7 @@ install( EXPORT cudf_kafka-exports ) -install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include DESTINATION include) +install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR}) rapids_export( INSTALL cudf_kafka diff --git a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake index aa4c5b60e7a..20aa9873f43 100644 --- a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake +++ b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2021-2022, NVIDIA CORPORATION. +# Copyright (c) 2021-2023, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -35,21 +35,21 @@ function(find_and_configure_cudf VERSION) endif() endfunction() -set(CUDA_KAFKA_MIN_VERSION_cudf - "${CUDA_KAFKA_VERSION_MAJOR}.${CUDA_KAFKA_VERSION_MINOR}.${CUDA_KAFKA_VERSION_PATCH}" +set(CUDF_KAFKA_MIN_VERSION + "${CUDF_KAFKA_VERSION_MAJOR}.${CUDF_KAFKA_VERSION_MINOR}.${CUDF_KAFKA_VERSION_PATCH}" ) -find_and_configure_cudf(${CUDA_KAFKA_MIN_VERSION_cudf}) +find_and_configure_cudf(${CUDF_KAFKA_MIN_VERSION}) if(cudf_REQUIRES_CUDA) - rapids_cuda_init_architectures(CUDA_KAFKA) + rapids_cuda_init_architectures(CUDF_KAFKA) # Since we are building cudf as part of ourselves we need to enable the CUDA language in the # top-most scope enable_language(CUDA) - # Since CUDA_KAFKA only enables CUDA optionally we need to manually include the file that + # Since CUDF_KAFKA only enables CUDA optionally we need to manually include the file that # rapids_cuda_init_architectures relies on `project` calling - if(DEFINED CMAKE_PROJECT_CUDA_KAFKA_INCLUDE) - include("${CMAKE_PROJECT_CUDA_KAFKA_INCLUDE}") + if(DEFINED CMAKE_PROJECT_CUDF_KAFKA_INCLUDE) + include("${CMAKE_PROJECT_CUDF_KAFKA_INCLUDE}") endif() endif() diff --git a/cpp/libcudf_kafka/tests/CMakeLists.txt b/cpp/libcudf_kafka/tests/CMakeLists.txt index 68a5327b455..b819cb6fc3b 100644 --- a/cpp/libcudf_kafka/tests/CMakeLists.txt +++ b/cpp/libcudf_kafka/tests/CMakeLists.txt @@ -26,7 +26,7 @@ function(ConfigureTest test_name) add_executable(${test_name} ${ARGN}) set_target_properties( ${test_name} - PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" + PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$" INSTALL_RPATH "\$ORIGIN/../../../lib" ) target_link_libraries( diff --git a/dependencies.yaml b/dependencies.yaml index b971a682571..97149a5e2ba 100644 --- a/dependencies.yaml +++ b/dependencies.yaml @@ -9,8 +9,8 @@ files: - build_all - build_cpp - build_wheels - - build_python - build_python_common + - build_python_cudf - cudatoolkit - develop - docs @@ -71,8 +71,8 @@ files: table: build-system includes: - build_all - - build_python - build_python_common + - build_python_cudf - build_wheels py_run_cudf: output: pyproject @@ -138,8 +138,8 @@ files: extras: table: build-system includes: - - build_wheels - build_python_common + - build_wheels py_run_cudf_kafka: output: pyproject pyproject_dir: python/cudf_kafka @@ -259,16 +259,16 @@ dependencies: - cython>=3.0.3 # TODO: Pin to numpy<1.25 until cudf requires pandas 2 - &numpy numpy>=1.21,<1.25 + - scikit-build>=0.13.1 - output_types: [conda, requirements, pyproject] packages: # Hard pin the patch version used during the build. This must be kept # in sync with the version pinned in get_arrow.cmake. - pyarrow==14.0.1.* - build_python: + build_python_cudf: common: - output_types: [conda, requirements, pyproject] packages: - - scikit-build>=0.13.1 - rmm==23.12.* - output_types: conda packages: @@ -302,9 +302,6 @@ dependencies: - cuda-nvrtc-dev - cuda-nvtx-dev - libcurand-dev - # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates - # cudf_kafka to use scikit-build - - cuda-gdb - matrix: cuda: "11.8" packages: diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt index 1b543b94589..c041c7f4842 100644 --- a/python/cudf/cudf/_lib/CMakeLists.txt +++ b/python/cudf/cudf/_lib/CMakeLists.txt @@ -81,12 +81,6 @@ target_link_libraries(strings_udf cudf_strings_udf) # necessary. The relevant command is tar -xf /opt/_internal/static-libs-for-embedding-only.tar.xz -C # /opt/_internal" find_package(NumPy REQUIRED) -set(targets_using_numpy interop avro csv orc json parquet) -foreach(target IN LISTS targets_using_numpy) - target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") - # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24. - # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}") -endforeach() set(targets_using_dlpack interop) foreach(target IN LISTS targets_using_dlpack) diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt new file mode 100644 index 00000000000..d55c3fdc076 --- /dev/null +++ b/python/cudf_kafka/CMakeLists.txt @@ -0,0 +1,47 @@ +# ============================================================================= +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) + +set(cudf_kafka_version 23.12.00) + +include(../../fetch_rapids.cmake) + +project( + cudf-kafka-python + VERSION ${cudf_kafka_version} + LANGUAGES # TODO: Building Python extension modules via the python_extension_module requires the C + # language to be enabled here. The test project that is built in scikit-build to verify + # various linking options for the python library is hardcoded to build with C, so until + # that is fixed we need to keep C. + C CXX +) + +find_package(cudf_kafka ${cudf_kafka_version} REQUIRED) + +if(NOT cudf_kafka_FOUND) + message( + FATAL_ERROR + "cudf_kafka package not found. cudf_kafka C++ is required to build this Python package." + ) +endif() + +include(rapids-cython) +rapids_cython_init() + +add_subdirectory(cudf_kafka/_lib) + +if(DEFINED cython_lib_dir) + rapids_cython_add_rpath_entries(TARGET cudf_kafka PATHS "${cython_lib_dir}") +endif() diff --git a/python/cudf_kafka/LICENSE b/python/cudf_kafka/LICENSE new file mode 120000 index 00000000000..30cff7403da --- /dev/null +++ b/python/cudf_kafka/LICENSE @@ -0,0 +1 @@ +../../LICENSE \ No newline at end of file diff --git a/python/cudf_kafka/README.md b/python/cudf_kafka/README.md new file mode 120000 index 00000000000..fe840054137 --- /dev/null +++ b/python/cudf_kafka/README.md @@ -0,0 +1 @@ +../../README.md \ No newline at end of file diff --git a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt new file mode 100644 index 00000000000..3262b7d5ebe --- /dev/null +++ b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt @@ -0,0 +1,62 @@ +# ============================================================================= +# Copyright (c) 2022-2023, NVIDIA CORPORATION. +# +# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except +# in compliance with the License. You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software distributed under the License +# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express +# or implied. See the License for the specific language governing permissions and limitations under +# the License. +# ============================================================================= + +set(cython_sources kafka.pyx) +set(linked_libraries cudf_kafka::cudf_kafka) + +rapids_cython_create_modules( + CXX ASSOCIATED_TARGETS cudf_kafka + SOURCE_FILES "${cython_sources}" + LINKED_LIBRARIES "${linked_libraries}" +) + +# TODO: Finding NumPy currently requires finding Development due to a bug in CMake. This bug was +# fixed in https://gitlab.kitware.com/cmake/cmake/-/merge_requests/7410 and will be available in +# CMake 3.24, so we can remove the Development component once we upgrade to CMake 3.24. +# find_package(Python REQUIRED COMPONENTS Development NumPy) + +# Note: The bug noted above prevents us from finding NumPy successfully using FindPython.cmake +# inside the manylinux images used to build wheels because manylinux images do not contain +# libpython.so and therefore Development cannot be found. Until we upgrade to CMake 3.24, we should +# use FindNumpy.cmake instead (provided by scikit-build). When we switch to 3.24 we can try +# switching back, but it may not work if that implicitly still requires Python libraries. In that +# case we'll need to follow up with the CMake team to remove that dependency. The stopgap solution +# is to unpack the static lib tarballs in the wheel building jobs so that there are at least static +# libs to be found, but that should be a last resort since it implies a dependency that isn't really +# necessary. The relevant command is tar -xf /opt/_internal/static-libs-for-embedding-only.tar.xz -C +# /opt/_internal" +find_package(NumPy REQUIRED) + +find_package(Python 3.9 REQUIRED COMPONENTS Interpreter) + +execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())" + OUTPUT_VARIABLE PYARROW_INCLUDE_DIR + ERROR_VARIABLE PYARROW_ERROR + RESULT_VARIABLE PYARROW_RESULT + OUTPUT_STRIP_TRAILING_WHITESPACE +) + +if(${PYARROW_RESULT}) + message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}") +endif() + +# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts +# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the +# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers. +# These requirements will go away once all scalar-related Cython code is removed from cudf. +foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS) + target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}") + target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}") +endforeach() diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd index ca729c62512..068837d04ee 100644 --- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd +++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd @@ -11,12 +11,12 @@ from cudf._lib.cpp.io.datasource cimport datasource from cudf._lib.io.datasource cimport Datasource -cdef extern from "kafka_callback.hpp" \ +cdef extern from "cudf_kafka/kafka_callback.hpp" \ namespace "cudf::io::external::kafka" nogil: ctypedef object (*python_callable_type)() -cdef extern from "kafka_consumer.hpp" \ +cdef extern from "cudf_kafka/kafka_consumer.hpp" \ namespace "cudf::io::external::kafka" nogil: cpdef cppclass kafka_consumer: diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml index 4829f06ab09..15431161d75 100644 --- a/python/cudf_kafka/pyproject.toml +++ b/python/cudf_kafka/pyproject.toml @@ -6,6 +6,7 @@ requires = [ "cython>=3.0.3", "numpy>=1.21,<1.25", "pyarrow==14.0.1.*", + "scikit-build>=0.13.1", "setuptools", "wheel", ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. diff --git a/python/cudf_kafka/setup.py b/python/cudf_kafka/setup.py index 6f3909d4528..6a99e9ed968 100644 --- a/python/cudf_kafka/setup.py +++ b/python/cudf_kafka/setup.py @@ -1,96 +1,13 @@ -# Copyright (c) 2020-2023, NVIDIA CORPORATION. -import os -import shutil -import sysconfig -from distutils.sysconfig import get_python_lib - -import numpy as np -import pyarrow as pa -from Cython.Build import cythonize -from setuptools import find_packages, setup -from setuptools.extension import Extension - -cython_files = ["cudf_kafka/_lib/*.pyx"] - -CUDA_HOME = os.environ.get("CUDA_HOME", False) -if not CUDA_HOME: - path_to_cuda_gdb = shutil.which("cuda-gdb") - if path_to_cuda_gdb is None: - raise OSError( - "Could not locate CUDA. " - "Please set the environment variable " - "CUDA_HOME to the path to the CUDA installation " - "and try again." - ) - CUDA_HOME = os.path.dirname(os.path.dirname(path_to_cuda_gdb)) - -if not os.path.isdir(CUDA_HOME): - raise OSError(f"Invalid CUDA_HOME: directory does not exist: {CUDA_HOME}") - -cuda_include_dir = os.path.join(CUDA_HOME, "include") - -CUDF_ROOT = os.environ.get( - "CUDF_ROOT", - os.path.abspath( - os.path.join( - os.path.dirname(os.path.abspath(__file__)), "../../cpp/build/" - ) - ), -) -CUDF_KAFKA_ROOT = os.environ.get( - "CUDF_KAFKA_ROOT", "../../cpp/libcudf_kafka/build" -) - -try: - nthreads = int(os.environ.get("PARALLEL_LEVEL", "0") or "0") -except Exception: - nthreads = 0 - -extensions = [ - Extension( - "*", - sources=cython_files, - include_dirs=[ - os.path.abspath(os.path.join(CUDF_ROOT, "../include/cudf")), - os.path.abspath(os.path.join(CUDF_ROOT, "../include")), - os.path.abspath( - os.path.join(CUDF_ROOT, "../libcudf_kafka/include/cudf_kafka") - ), - os.path.join(CUDF_ROOT, "include"), - os.path.join(CUDF_ROOT, "_deps/libcudacxx-src/include"), - os.path.join( - os.path.dirname(sysconfig.get_path("include")), - "rapids/libcudacxx", - ), - os.path.dirname(sysconfig.get_path("include")), - np.get_include(), - pa.get_include(), - cuda_include_dir, - ], - library_dirs=( - [ - get_python_lib(), - os.path.join(os.sys.prefix, "lib"), - CUDF_KAFKA_ROOT, - ] - ), - libraries=["cudf", "cudf_kafka"], - language="c++", - extra_compile_args=["-std=c++17", "-DFMT_HEADER_ONLY=1"], - ) -] +# Copyright (c) 2018-2023, NVIDIA CORPORATION. +from setuptools import find_packages +from skbuild import setup packages = find_packages(include=["cudf_kafka*"]) + setup( - # Include the separately-compiled shared library - ext_modules=cythonize( - extensions, - nthreads=nthreads, - compiler_directives=dict( - profile=False, language_level=3, embedsignature=True - ), - ), packages=packages, - package_data={key: ["VERSION", "*.pxd"] for key in packages}, + package_data={ + key: ["VERSION", "*.pxd", "*.hpp", "*.cuh"] for key in packages + }, zip_safe=False, ) From b0c1b7b82ccdf1a7e4159cb3bffa1984092440d4 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 14 Nov 2023 12:48:32 -0500 Subject: [PATCH 08/17] Add BytePairEncoder class to cuDF (#13891) Adds a new BytePairEncoding class to cuDF ``` >>> import cudf >>> from cudf.core.byte_pair_encoding import BytePairEncoder >>> mps = cudf.read_text('merges.txt', delimiter='\n', strip_delimiters=True) >>> bpe = BytePairEncoder(mps) >>> str_series = cudf.Series(['This is a sentence', 'thisisit']) >>> bpe(str_series) 0 This is a sent ence 1 this is it dtype: object ``` This class wraps the existing `nvtext::byte_pair_encoding` APIs to load the merge-pairs data and encode a column of strings. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/13891 --- .../cudf/_lib/cpp/nvtext/byte_pair_encode.pxd | 24 ++++++++ python/cudf/cudf/_lib/nvtext/CMakeLists.txt | 4 +- .../cudf/_lib/nvtext/byte_pair_encode.pyx | 50 ++++++++++++++++ python/cudf/cudf/core/byte_pair_encoding.py | 59 +++++++++++++++++++ .../cudf/cudf/tests/text/test_text_methods.py | 41 +++++++++++++ 5 files changed, 176 insertions(+), 2 deletions(-) create mode 100644 python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd create mode 100644 python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx create mode 100644 python/cudf/cudf/core/byte_pair_encoding.py diff --git a/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd new file mode 100644 index 00000000000..e678e4e84db --- /dev/null +++ b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd @@ -0,0 +1,24 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from libcpp.memory cimport unique_ptr +from libcpp.string cimport string + +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.scalar.scalar cimport string_scalar + + +cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil: + + cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs": + pass + + cdef unique_ptr[bpe_merge_pairs] load_merge_pairs( + const column_view &merge_pairs + ) except + + + cdef unique_ptr[column] byte_pair_encoding( + const column_view &strings, + const bpe_merge_pairs &merge_pairs, + const string_scalar &separator + ) except + diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt index d4e2392ee04..d7cbdeb5bda 100644 --- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt +++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt @@ -13,8 +13,8 @@ # ============================================================================= set(cython_sources - edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx - replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx + byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx + ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx ) set(linked_libraries cudf::cudf) rapids_cython_create_modules( diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx new file mode 100644 index 00000000000..cfc76afa8a5 --- /dev/null +++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx @@ -0,0 +1,50 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + + +from cudf.core.buffer import acquire_spill_lock + +from libcpp.memory cimport unique_ptr +from libcpp.utility cimport move + +from cudf._lib.column cimport Column +from cudf._lib.cpp.column.column cimport column +from cudf._lib.cpp.column.column_view cimport column_view +from cudf._lib.cpp.nvtext.byte_pair_encode cimport ( + bpe_merge_pairs as cpp_bpe_merge_pairs, + byte_pair_encoding as cpp_byte_pair_encoding, + load_merge_pairs as cpp_load_merge_pairs, +) +from cudf._lib.cpp.scalar.scalar cimport string_scalar +from cudf._lib.scalar cimport DeviceScalar + + +cdef class BPEMergePairs: + cdef unique_ptr[cpp_bpe_merge_pairs] c_obj + + def __cinit__(self, Column merge_pairs): + cdef column_view c_pairs = merge_pairs.view() + with nogil: + self.c_obj = move(cpp_load_merge_pairs(c_pairs)) + + +@acquire_spill_lock() +def byte_pair_encoding( + Column strings, + BPEMergePairs merge_pairs, + object separator +): + cdef column_view c_strings = strings.view() + cdef DeviceScalar d_separator = separator.device_value + cdef const string_scalar* c_separator = d_separator\ + .get_raw_ptr() + cdef unique_ptr[column] c_result + with nogil: + c_result = move( + cpp_byte_pair_encoding( + c_strings, + merge_pairs.c_obj.get()[0], + c_separator[0] + ) + ) + + return Column.from_unique_ptr(move(c_result)) diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py new file mode 100644 index 00000000000..4c881022ecf --- /dev/null +++ b/python/cudf/cudf/core/byte_pair_encoding.py @@ -0,0 +1,59 @@ +# Copyright (c) 2023, NVIDIA CORPORATION. + +from __future__ import annotations + +import cudf +from cudf._lib.nvtext.byte_pair_encode import ( + BPEMergePairs as cpp_merge_pairs, + byte_pair_encoding as cpp_byte_pair_encoding, +) + + +class BytePairEncoder: + """ + Given a merge pairs strings series, performs byte pair encoding on + a strings series using the provided separator. + + Parameters + ---------- + merges_pairs : str + Strings column of merge pairs + + Returns + ------- + BytePairEncoder + """ + + def __init__(self, merges_pair: "cudf.Series"): + self.merge_pairs = cpp_merge_pairs(merges_pair._column) + + def __call__(self, text, separator: str = " "): + """ + + Parameters + ---------- + text : cudf string series + The strings to be encoded. + + Returns + ------- + Encoded strings + + Examples + -------- + >>> import cudf + >>> from cudf.core.byte_pair_encoding import BytePairEncoder + >>> mps = cudf.Series(["e n", "i t", "i s", "e s", "en t", + ... "c e", "es t", "en ce", "T h", "Th is", + ... "t est", "s ent", "t h", "th is"]) + >>> bpe = BytePairEncoder(mps) + >>> str_series = cudf.Series(['This is the sentence', 'thisisit']) + >>> bpe(str_series) + 0 This is a sent ence + 1 this is it + dtype: object + """ + sep = cudf.Scalar(separator, dtype="str") + result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep) + + return cudf.Series(result) diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py index e565df8f3da..2dccd583b23 100644 --- a/python/cudf/cudf/tests/text/test_text_methods.py +++ b/python/cudf/cudf/tests/text/test_text_methods.py @@ -7,6 +7,7 @@ import pytest import cudf +from cudf.core.byte_pair_encoding import BytePairEncoder from cudf.core.tokenize_vocabulary import TokenizeVocabulary from cudf.testing._utils import assert_eq @@ -1024,3 +1025,43 @@ def test_jaccard_index_random_strings(): actual = str1.str.jaccard_index(str2, jaccard_width) assert_eq(expected, actual) + + +@pytest.mark.parametrize( + "separator, input, results", + [ + (" ", "thetestsentence", "the test sent ence"), + ("_", "sentenceistest", "sent_ence_is_test"), + ("$", "istestsentencehere", "is$test$sent$ence$he$r$e"), + ], +) +def test_byte_pair_encoding(separator, input, results): + pairs_table = cudf.Series( + [ + "t he", + "h e", + "e n", + "i t", + "i s", + "e s", + "en t", + "c e", + "es t", + "en ce", + "t h", + "h i", + "th is", + "t est", + "s i", + "s ent", + ] + ) + encoder = BytePairEncoder(pairs_table) + + strings = cudf.Series([input, None, "", input]) + + expected = cudf.Series([results, None, "", results]) + + actual = encoder(strings, separator) + assert type(expected) == type(actual) + assert_eq(expected, actual) From b446a6f187241e765c925da1053ece2679313a06 Mon Sep 17 00:00:00 2001 From: David Wendt <45795991+davidwendt@users.noreply.github.com> Date: Tue, 14 Nov 2023 12:49:19 -0500 Subject: [PATCH 09/17] Fix token-count logic in nvtext::tokenize_with_vocabulary (#14393) Fixes a bug introduced in #14336 when trying to simplify the token-counting logic as per this discussion https://github.com/rapidsai/cudf/pull/14336#discussion_r1378173552 The simplification caused an error which was found when running the nvtext benchmarks. The appropriate gtest has been updated to cover this case now. Authors: - David Wendt (https://github.com/davidwendt) Approvers: - Bradley Dice (https://github.com/bdice) - Karthikeyan (https://github.com/karthikeyann) URL: https://github.com/rapidsai/cudf/pull/14393 --- cpp/benchmarks/text/vocab.cpp | 2 +- cpp/src/text/vocabulary_tokenize.cu | 8 ++++++-- cpp/tests/text/tokenize_tests.cpp | 12 ++++++------ 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp index 6922b7214ff..80942e2697d 100644 --- a/cpp/benchmarks/text/vocab.cpp +++ b/cpp/benchmarks/text/vocab.cpp @@ -53,7 +53,7 @@ static void bench_vocab_tokenize(nvbench::state& state) auto const vocab_col = [] { data_profile const profile = data_profile_builder().no_validity().distribution( - cudf::type_id::STRING, distribution_id::NORMAL, 0, 5); + cudf::type_id::STRING, distribution_id::NORMAL, 0, 15); auto const col = create_random_column(cudf::type_id::STRING, row_count{100}, profile); return cudf::strings::filter_characters_of_type( cudf::strings_column_view(col->view()), diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu index 41f8c0a8731..511f1995374 100644 --- a/cpp/src/text/vocabulary_tokenize.cu +++ b/cpp/src/text/vocabulary_tokenize.cu @@ -276,8 +276,12 @@ __global__ void token_counts_fn(cudf::column_device_view const d_strings, __syncwarp(); for (auto itr = d_output + lane_idx + 1; itr < d_output_end; itr += cudf::detail::warp_size) { - // add one if at the edge of a token or at the string's end - count += ((*itr && !(*(itr - 1))) || (itr + 1 == d_output_end)); + // add one if at the edge of a token or if at the string's end + if (*itr) { + count += !(*(itr - 1)); + } else { + count += (itr + 1 == d_output_end); + } } __syncwarp(); diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp index 8118183a458..ea36e13de6f 100644 --- a/cpp/tests/text/tokenize_tests.cpp +++ b/cpp/tests/text/tokenize_tests.cpp @@ -246,14 +246,14 @@ TEST_F(TextTokenizeTest, Vocabulary) TEST_F(TextTokenizeTest, VocabularyLongStrings) { - cudf::test::strings_column_wrapper vocabulary( // leaving out 'cat' on purpose + cudf::test::strings_column_wrapper vocabulary( {"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"}); auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary)); std::vector h_strings( 4, "the fox jumped chased the dog cheese mouse at the over there dog mouse cat plus the horse " - "jumped over the mouse house with the dog"); + "jumped over the mousé house with the dog "); cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end()); auto input_view = cudf::strings_column_view(input); auto delimiter = cudf::string_scalar(" "); @@ -262,10 +262,10 @@ TEST_F(TextTokenizeTest, VocabularyLongStrings) using LCW = cudf::test::lists_column_wrapper; // clang-format off - LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}, - LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}, - LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}, - LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}}); + LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}, + LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}, + LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}, + LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}}); // clang-format on CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected); From 8106a0c3d2050786f42152a280bd9315b897379e Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Tue, 14 Nov 2023 16:03:54 -0600 Subject: [PATCH 10/17] Cleanup remaining usages of dask dependencies (#14407) This PR switches remaining usages of `dask` dependencies to use `rapids-dask-dependency` Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Bradley Dice (https://github.com/bdice) - Jake Awe (https://github.com/AyodeAwe) - Vyas Ramasubramani (https://github.com/vyasr) URL: https://github.com/rapidsai/cudf/pull/14407 --- conda/recipes/custreamz/meta.yaml | 4 +--- conda/recipes/dask-cudf/run_test.sh | 36 ----------------------------- 2 files changed, 1 insertion(+), 39 deletions(-) delete mode 100644 conda/recipes/dask-cudf/run_test.sh diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index fb6efabffd4..b8c5918ea60 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -45,9 +45,7 @@ requirements: - streamz - cudf ={{ version }} - cudf_kafka ={{ version }} - - dask >=2023.9.2 - - dask-core >=2023.9.2 - - distributed >=2023.9.2 + - rapids-dask-dependency ={{ version }} - python-confluent-kafka >=1.9.0,<1.10.0a0 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh deleted file mode 100644 index e7238d00f2b..00000000000 --- a/conda/recipes/dask-cudf/run_test.sh +++ /dev/null @@ -1,36 +0,0 @@ -#!/bin/bash -# Copyright (c) 2020-2023, NVIDIA CORPORATION. - -set -e - -# Logger function for build status output -function logger() { - echo -e "\n>>>> $@\n" -} - -# Importing cudf on arm64 CPU only nodes is currently not working due to a -# difference in reported gpu devices between arm64 and amd64 -ARCH=$(arch) - -if [ "${ARCH}" = "aarch64" ]; then - logger "Skipping tests on arm64" - exit 0 -fi - -# Dask & Distributed option to install main(nightly) or `conda-forge` packages. -export INSTALL_DASK_MAIN=1 - -# Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2023.9.2" - -# Install the conda-forge or nightly version of dask and distributed -if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then - rapids-logger "rapids-mamba-retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'" - rapids-mamba-retry install -c dask/label/dev "dask/label/dev::dask" "dask/label/dev::distributed" -else - rapids-logger "rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" - rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall -fi - -logger "python -c 'import dask_cudf'" -python -c "import dask_cudf" From 27b052d01ebdfd3690b90588971817423614acc0 Mon Sep 17 00:00:00 2001 From: shrshi Date: Tue, 14 Nov 2023 14:39:07 -0800 Subject: [PATCH 11/17] Added streams to CSV reader and writer api (#14340) This PR contributes to https://github.com/rapidsai/cudf/issues/13744. -Added stream parameters to public APIs `cudf::io::read_csv` `cudf::io::write_csv` -Added stream gtests Authors: - https://github.com/shrshi - Karthikeyan (https://github.com/karthikeyann) Approvers: - Karthikeyan (https://github.com/karthikeyann) - Vukasin Milovanovic (https://github.com/vuule) - Yunsong Wang (https://github.com/PointKernel) URL: https://github.com/rapidsai/cudf/pull/14340 --- cpp/include/cudf/io/csv.hpp | 4 + cpp/include/cudf/io/detail/csv.hpp | 1 - cpp/include/cudf_test/column_wrapper.hpp | 16 ++-- cpp/src/io/csv/writer_impl.cu | 38 +++++---- cpp/src/io/functions.cpp | 12 ++- cpp/tests/CMakeLists.txt | 1 + cpp/tests/streams/io/csv_test.cpp | 102 +++++++++++++++++++++++ 7 files changed, 150 insertions(+), 24 deletions(-) create mode 100644 cpp/tests/streams/io/csv_test.cpp diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp index ac885c54356..435583e805d 100644 --- a/cpp/include/cudf/io/csv.hpp +++ b/cpp/include/cudf/io/csv.hpp @@ -1307,6 +1307,7 @@ class csv_reader_options_builder { * @endcode * * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata * @@ -1314,6 +1315,7 @@ class csv_reader_options_builder { */ table_with_metadata read_csv( csv_reader_options options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group @@ -1715,9 +1717,11 @@ class csv_writer_options_builder { * @endcode * * @param options Settings for controlling writing behavior + * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource to use for device memory allocation */ void write_csv(csv_writer_options const& options, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 9fdc7a47fb9..40ddcf385b0 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -17,7 +17,6 @@ #pragma once #include -#include #include diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index e94dfea9dcf..b9f2e0d9868 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -803,7 +803,8 @@ class strings_column_wrapper : public detail::column_wrapper { offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); auto d_bitmask = cudf::detail::make_device_uvector_sync( null_mask, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource()); - wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask, null_count); + wrapped = cudf::make_strings_column( + d_chars, d_offsets, d_bitmask, null_count, cudf::test::get_default_stream()); } /** @@ -1846,7 +1847,8 @@ class structs_column_wrapper : public detail::column_wrapper { child_column_wrappers.end(), std::back_inserter(child_columns), [&](auto const& column_wrapper) { - return std::make_unique(column_wrapper.get()); + return std::make_unique(column_wrapper.get(), + cudf::test::get_default_stream()); }); init(std::move(child_columns), validity); } @@ -1882,7 +1884,8 @@ class structs_column_wrapper : public detail::column_wrapper { child_column_wrappers.end(), std::back_inserter(child_columns), [&](auto const& column_wrapper) { - return std::make_unique(column_wrapper.get()); + return std::make_unique(column_wrapper.get(), + cudf::test::get_default_stream()); }); init(std::move(child_columns), validity_iter); } @@ -1906,8 +1909,11 @@ class structs_column_wrapper : public detail::column_wrapper { return cudf::test::detail::make_null_mask(validity.begin(), validity.end()); }(); - wrapped = cudf::make_structs_column( - num_rows, std::move(child_columns), null_count, std::move(null_mask)); + wrapped = cudf::make_structs_column(num_rows, + std::move(child_columns), + null_count, + std::move(null_mask), + cudf::test::get_default_stream()); } template diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 8c586306ad5..6e9c634804c 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -146,6 +146,12 @@ struct column_to_strings_fn { { } + ~column_to_strings_fn() = default; + column_to_strings_fn(column_to_strings_fn const&) = delete; + column_to_strings_fn& operator=(column_to_strings_fn const&) = delete; + column_to_strings_fn(column_to_strings_fn&&) = delete; + column_to_strings_fn& operator=(column_to_strings_fn&&) = delete; + // Note: `null` replacement with `na_rep` deferred to `concatenate()` // instead of column-wise; might be faster // @@ -160,8 +166,9 @@ struct column_to_strings_fn { std::enable_if_t, std::unique_ptr> operator()( column_view const& column) const { - return cudf::strings::detail::from_booleans( - column, options_.get_true_value(), options_.get_false_value(), stream_, mr_); + string_scalar true_string{options_.get_true_value(), true, stream_}; + string_scalar false_string{options_.get_false_value(), true, stream_}; + return cudf::strings::detail::from_booleans(column, true_string, false_string, stream_, mr_); } // strings: @@ -367,10 +374,10 @@ void write_chunked(data_sink* out_sink, CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); - cudf::string_scalar newline{options.get_line_terminator()}; + cudf::string_scalar newline{options.get_line_terminator(), true, stream}; auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, newline, - string_scalar("", false), + string_scalar{"", false, stream}, stream, rmm::mr::get_current_device_resource()); strings_column_view strings_column{p_str_col_w_nl->view()}; @@ -455,12 +462,14 @@ void write_csv(data_sink* out_sink, // populate vector of string-converted columns: // - std::transform(sub_view.begin(), - sub_view.end(), - std::back_inserter(str_column_vec), - [converter](auto const& current_col) { - return cudf::type_dispatcher(current_col.type(), converter, current_col); - }); + std::transform( + sub_view.begin(), + sub_view.end(), + std::back_inserter(str_column_vec), + [&converter = std::as_const(converter)](auto const& current_col) { + return cudf::type_dispatcher( + current_col.type(), converter, current_col); + }); // create string table view from str_column_vec: // @@ -470,18 +479,19 @@ void write_csv(data_sink* out_sink, // concatenate columns in each row into one big string column // (using null representation and delimiter): // - std::string delimiter_str{options.get_inter_column_delimiter()}; auto str_concat_col = [&] { + cudf::string_scalar delimiter_str{ + std::string{options.get_inter_column_delimiter()}, true, stream}; + cudf::string_scalar options_narep{options.get_na_rep(), true, stream}; if (str_table_view.num_columns() > 1) return cudf::strings::detail::concatenate(str_table_view, delimiter_str, - options.get_na_rep(), + options_narep, strings::separator_on_nulls::YES, stream, rmm::mr::get_current_device_resource()); - cudf::string_scalar narep{options.get_na_rep()}; return cudf::strings::detail::replace_nulls( - str_table_view.column(0), narep, stream, rmm::mr::get_current_device_resource()); + str_table_view.column(0), options_narep, stream, rmm::mr::get_current_device_resource()); }(); write_chunked(out_sink, str_concat_col->view(), options, stream, mr); diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index 00d56008611..964e40e36cd 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -230,7 +230,9 @@ void write_json(json_writer_options const& options, mr); } -table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) +table_with_metadata read_csv(csv_reader_options options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); @@ -245,12 +247,14 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_ return cudf::io::detail::csv::read_csv( // std::move(datasources[0]), options, - cudf::get_default_stream(), + stream, mr); } // Freeform API wraps the detail writer class API -void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resource* mr) +void write_csv(csv_writer_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { using namespace cudf::io::detail; @@ -262,7 +266,7 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc options.get_table(), options.get_names(), options, - cudf::get_default_stream(), + stream, mr); } diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt index 7b628649051..1be8566fb0f 100644 --- a/cpp/tests/CMakeLists.txt +++ b/cpp/tests/CMakeLists.txt @@ -635,6 +635,7 @@ ConfigureTest( ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing) +ConfigureTest(STREAM_CSVIO_TEST streams/io/csv_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing) ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing) diff --git a/cpp/tests/streams/io/csv_test.cpp b/cpp/tests/streams/io/csv_test.cpp new file mode 100644 index 00000000000..88514fa412c --- /dev/null +++ b/cpp/tests/streams/io/csv_test.cpp @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2023, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include + +auto const temp_env = static_cast( + ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment)); + +class CSVTest : public cudf::test::BaseFixture {}; + +TEST_F(CSVTest, CSVWriter) +{ + constexpr auto num_rows = 10; + + std::vector zeros(num_rows, 0); + std::vector ones(num_rows, 1); + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{12}}; + }); + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{-12}}; + }); + + cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col2(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col6(col6_data, col6_data + num_rows); + cudf::test::fixed_width_column_wrapper col7(col7_data, col7_data + num_rows); + + std::vector col8_data(num_rows, "rapids"); + cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end()); + + cudf::table_view tab({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + auto const filepath = temp_env->get_temp_dir() + "multicolumn.csv"; + auto w_options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{filepath}, tab) + .include_header(false) + .inter_column_delimiter(','); + cudf::io::write_csv(w_options.build(), cudf::test::get_default_stream()); +} + +TEST_F(CSVTest, CSVReader) +{ + constexpr auto num_rows = 10; + + std::vector zeros(num_rows, 0); + std::vector ones(num_rows, 1); + auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{12}}; + }); + auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) { + return numeric::decimal128{ones[i], numeric::scale_type{-12}}; + }); + + cudf::test::fixed_width_column_wrapper col0(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col1(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col2(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col3(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col4(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col5(zeros.begin(), zeros.end()); + cudf::test::fixed_width_column_wrapper col6(col6_data, col6_data + num_rows); + cudf::test::fixed_width_column_wrapper col7(col7_data, col7_data + num_rows); + + std::vector col8_data(num_rows, "rapids"); + cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end()); + + cudf::table_view tab({col0, col1, col2, col3, col4, col5, col6, col7, col8}); + + auto const filepath = temp_env->get_temp_dir() + "multicolumn.csv"; + auto w_options = cudf::io::csv_writer_options::builder(cudf::io::sink_info{filepath}, tab) + .include_header(false) + .inter_column_delimiter(','); + cudf::io::write_csv(w_options.build(), cudf::test::get_default_stream()); +} From 330d389b26a05676d9f079503a3d96b571762337 Mon Sep 17 00:00:00 2001 From: Robert Maynard Date: Tue, 14 Nov 2023 17:56:55 -0500 Subject: [PATCH 12/17] Ensure nvbench initializes nvml context when built statically (#14411) Port https://github.com/NVIDIA/nvbench/pull/148 to cudf so that nvbench benchmarks work now that we always use a static version of nvbench. Authors: - Robert Maynard (https://github.com/robertmaynard) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14411 --- cpp/cmake/thirdparty/patches/nvbench_override.json | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json index ad9b19c29c1..f85bdb9486c 100644 --- a/cpp/cmake/thirdparty/patches/nvbench_override.json +++ b/cpp/cmake/thirdparty/patches/nvbench_override.json @@ -7,6 +7,11 @@ "file" : "${current_json_dir}/nvbench_global_setup.diff", "issue" : "Fix add support for global setup to initialize RMM in nvbench [https://github.com/NVIDIA/nvbench/pull/123]", "fixed_in" : "" + }, + { + "file" : "nvbench/nvml_with_static_builds.diff", + "issue" : "Add support for nvml with static nvbench [https://github.com/NVIDIA/nvbench/pull/148]", + "fixed_in" : "" } ] } From 8a0a08f34ff804a7329ea640aa1e0a9b188d2162 Mon Sep 17 00:00:00 2001 From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com> Date: Tue, 14 Nov 2023 17:55:16 -1000 Subject: [PATCH 13/17] Fix as_column(pd.Timestamp/Timedelta, length=) not respecting length (#14390) Noticed this while trying to clean up `as_column` Authors: - Matthew Roeschke (https://github.com/mroeschke) Approvers: - Bradley Dice (https://github.com/bdice) URL: https://github.com/rapidsai/cudf/pull/14390 --- python/cudf/cudf/core/column/column.py | 5 ++++- python/cudf/cudf/tests/test_column.py | 13 ++++++++----- 2 files changed, 12 insertions(+), 6 deletions(-) diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py index a5e99abd79e..b4f65693d85 100644 --- a/python/cudf/cudf/core/column/column.py +++ b/python/cudf/cudf/core/column/column.py @@ -2102,7 +2102,10 @@ def as_column( elif isinstance(arbitrary, (pd.Timestamp, pd.Timedelta)): # This will always treat NaTs as nulls since it's not technically a # discrete value like NaN - data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True)) + length = length or 1 + data = as_column( + pa.array(pd.Series([arbitrary] * length), from_pandas=True) + ) if dtype is not None: data = data.astype(dtype) diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py index db0446d506c..0546638f388 100644 --- a/python/cudf/cudf/tests/test_column.py +++ b/python/cudf/cudf/tests/test_column.py @@ -193,12 +193,15 @@ def test_column_mixed_dtype(data, error): @pytest.mark.parametrize("nan_as_null", [True, False]) -def test_as_column_scalar_with_nan(nan_as_null): - size = 10 - scalar = np.nan - +@pytest.mark.parametrize( + "scalar", + [np.nan, pd.Timedelta(days=1), pd.Timestamp(2020, 1, 1)], + ids=repr, +) +@pytest.mark.parametrize("size", [1, 10]) +def test_as_column_scalar_with_nan(nan_as_null, scalar, size): expected = ( - cudf.Series([np.nan] * size, nan_as_null=nan_as_null) + cudf.Series([scalar] * size, nan_as_null=nan_as_null) .dropna() .to_numpy() ) From ab2248ea8e693143823d02bb8b806c65bfc3bf32 Mon Sep 17 00:00:00 2001 From: Vukasin Milovanovic Date: Tue, 14 Nov 2023 23:30:51 -0800 Subject: [PATCH 14/17] Fix and disable encoding for nanosecond statistics in ORC writer (#14367) Issue https://github.com/rapidsai/cudf/issues/14325 Use uint when reading/writing nano stats because nanoseconds have int32 encoding (different from both unit32 and sint32, _obviously_), which does not use zigzag. sint32 uses zigzag, and unit32 does not allow negative numbers, so we can use uint since we'll never have negative nanoseconds. Also disabled the nanoseconds because it should only be written after ORC-135; we don't write the version so readers get confused if nanoseconds are there. Planning to re-enable once we start writing the version. Authors: - Vukasin Milovanovic (https://github.com/vuule) Approvers: - Vyas Ramasubramani (https://github.com/vyasr) - Nghia Truong (https://github.com/ttnghia) URL: https://github.com/rapidsai/cudf/pull/14367 --- cpp/include/cudf/io/orc_metadata.hpp | 8 +++---- cpp/src/io/orc/orc.cpp | 13 +++++++++++ cpp/src/io/orc/orc.hpp | 6 +++++ cpp/src/io/orc/stats_enc.cu | 35 +++++++++++++++++++++------- cpp/tests/io/orc_test.cpp | 16 +++++++++---- 5 files changed, 61 insertions(+), 17 deletions(-) diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index 82d59803c25..9531a012e49 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -141,10 +141,10 @@ using binary_statistics = sum_statistics; * the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values adjusted to UTC. */ struct timestamp_statistics : minmax_statistics { - std::optional minimum_utc; ///< minimum in milliseconds - std::optional maximum_utc; ///< maximum in milliseconds - std::optional minimum_nanos; ///< nanoseconds part of the minimum - std::optional maximum_nanos; ///< nanoseconds part of the maximum + std::optional minimum_utc; ///< minimum in milliseconds + std::optional maximum_utc; ///< maximum in milliseconds + std::optional minimum_nanos; ///< nanoseconds part of the minimum + std::optional maximum_nanos; ///< nanoseconds part of the maximum }; namespace orc { diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp index bc399b75ef9..ee5fa4e8b5a 100644 --- a/cpp/src/io/orc/orc.cpp +++ b/cpp/src/io/orc/orc.cpp @@ -182,6 +182,19 @@ void ProtobufReader::read(timestamp_statistics& s, size_t maxlen) field_reader(5, s.minimum_nanos), field_reader(6, s.maximum_nanos)); function_builder(s, maxlen, op); + + // Adjust nanoseconds because they are encoded as (value + 1) + // Range [1, 1000'000] is translated here to [0, 999'999] + if (s.minimum_nanos.has_value()) { + auto& min_nanos = s.minimum_nanos.value(); + CUDF_EXPECTS(min_nanos >= 1 and min_nanos <= 1000'000, "Invalid minimum nanoseconds"); + --min_nanos; + } + if (s.maximum_nanos.has_value()) { + auto& max_nanos = s.maximum_nanos.value(); + CUDF_EXPECTS(max_nanos >= 1 and max_nanos <= 1000'000, "Invalid maximum nanoseconds"); + --max_nanos; + } } void ProtobufReader::read(column_statistics& s, size_t maxlen) diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 6f65e384d2d..783ed4206b6 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -41,6 +41,12 @@ static constexpr uint32_t block_header_size = 3; // Seconds from January 1st, 1970 to January 1st, 2015 static constexpr int64_t orc_utc_epoch = 1420070400; +// Used for the nanosecond remainder in timestamp statistics when the actual nanoseconds of min/max +// are not included. As the timestamp statistics are stored as milliseconds + nanosecond remainder, +// the maximum nanosecond remainder is 999,999 (nanoseconds in a millisecond - 1). +static constexpr int32_t DEFAULT_MIN_NANOS = 0; +static constexpr int32_t DEFAULT_MAX_NANOS = 999'999; + struct PostScript { uint64_t footerLength = 0; // the length of the footer section in bytes CompressionKind compression = NONE; // the kind of generic compression used diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index 479a2dfada3..429fd5b929d 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -27,6 +27,10 @@ namespace cudf::io::orc::gpu { using strings::detail::fixed_point_string_size; +// Nanosecond statistics should not be enabled until the spec version is set correctly in the output +// files. See https://github.com/rapidsai/cudf/issues/14325 for more details +constexpr bool enable_nanosecond_statistics = false; + constexpr unsigned int init_threads_per_group = 32; constexpr unsigned int init_groups_per_block = 4; constexpr unsigned int init_threads_per_block = init_threads_per_group * init_groups_per_block; @@ -96,8 +100,10 @@ __global__ void __launch_bounds__(block_size, 1) stats_len = pb_fldlen_common + pb_fld_hdrlen + 2 * (pb_fld_hdrlen + pb_fldlen_int64); break; case dtype_timestamp64: - stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64) + - 2 * (pb_fld_hdrlen + pb_fldlen_int32); + stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64); + if constexpr (enable_nanosecond_statistics) { + stats_len += 2 * (pb_fld_hdrlen + pb_fldlen_int32); + } break; case dtype_float32: case dtype_float64: @@ -405,7 +411,8 @@ __global__ void __launch_bounds__(encode_threads_per_block) // optional sint64 minimumUtc = 3; // min,max values saved as milliseconds since UNIX epoch // optional sint64 maximumUtc = 4; // optional int32 minimumNanos = 5; // lower 6 TS digits for min/max to achieve nanosecond - // precision optional int32 maximumNanos = 6; + // precision + // optional int32 maximumNanos = 6; // } if (s->chunk.has_minmax) { cur[0] = 9 * 8 + ProtofType::FIXEDLEN; @@ -416,12 +423,22 @@ __global__ void __launch_bounds__(encode_threads_per_block) split_nanosecond_timestamp(s->chunk.max_value.i_val); // minimum/maximum are the same as minimumUtc/maximumUtc as we always write files in UTC - cur = pb_put_int(cur, 1, min_ms); // minimum - cur = pb_put_int(cur, 2, max_ms); // maximum - cur = pb_put_int(cur, 3, min_ms); // minimumUtc - cur = pb_put_int(cur, 4, max_ms); // maximumUtc - cur = pb_put_int(cur, 5, min_ns_remainder); // minimumNanos - cur = pb_put_int(cur, 6, max_ns_remainder); // maximumNanos + cur = pb_put_int(cur, 1, min_ms); // minimum + cur = pb_put_int(cur, 2, max_ms); // maximum + cur = pb_put_int(cur, 3, min_ms); // minimumUtc + cur = pb_put_int(cur, 4, max_ms); // maximumUtc + + if constexpr (enable_nanosecond_statistics) { + if (min_ns_remainder != DEFAULT_MIN_NANOS) { + // using uint because positive values are not zigzag encoded + cur = pb_put_uint(cur, 5, min_ns_remainder + 1); // minimumNanos + } + if (max_ns_remainder != DEFAULT_MAX_NANOS) { + // using uint because positive values are not zigzag encoded + cur = pb_put_uint(cur, 6, max_ns_remainder + 1); // maximumNanos + } + } + fld_start[1] = cur - (fld_start + 2); } break; diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp index 234716749ff..dca3886db14 100644 --- a/cpp/tests/io/orc_test.cpp +++ b/cpp/tests/io/orc_test.cpp @@ -1054,8 +1054,12 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*ts4.maximum, 3); EXPECT_EQ(*ts4.minimum_utc, -4); EXPECT_EQ(*ts4.maximum_utc, 3); - EXPECT_EQ(*ts4.minimum_nanos, 999994); - EXPECT_EQ(*ts4.maximum_nanos, 6); + // nanosecond precision can't be included until we write a writer version that includes ORC-135 + // see https://github.com/rapidsai/cudf/issues/14325 + // EXPECT_EQ(*ts4.minimum_nanos, 999994); + EXPECT_FALSE(ts4.minimum_nanos.has_value()); + // EXPECT_EQ(*ts4.maximum_nanos, 6); + EXPECT_FALSE(ts4.maximum_nanos.has_value()); auto& s5 = stats[5]; EXPECT_EQ(*s5.number_of_values, 4ul); @@ -1065,8 +1069,12 @@ TEST_F(OrcStatisticsTest, Basic) EXPECT_EQ(*ts5.maximum, 3000); EXPECT_EQ(*ts5.minimum_utc, -3001); EXPECT_EQ(*ts5.maximum_utc, 3000); - EXPECT_EQ(*ts5.minimum_nanos, 994000); - EXPECT_EQ(*ts5.maximum_nanos, 6000); + // nanosecond precision can't be included until we write a writer version that includes ORC-135 + // see https://github.com/rapidsai/cudf/issues/14325 + // EXPECT_EQ(*ts5.minimum_nanos, 994000); + EXPECT_FALSE(ts5.minimum_nanos.has_value()); + // EXPECT_EQ(*ts5.maximum_nanos, 6000); + EXPECT_FALSE(ts5.maximum_nanos.has_value()); auto& s6 = stats[6]; EXPECT_EQ(*s6.number_of_values, 4ul); From 8deb3dd7573000e7d87f18a9e2bbe39cf2932e10 Mon Sep 17 00:00:00 2001 From: GALI PREM SAGAR Date: Wed, 15 Nov 2023 07:56:37 -0600 Subject: [PATCH 15/17] Raise error in `reindex` when `index` is not unique (#14400) Fixes: #14398 This PR raises an error in `reindex` API when reindexing is performed on a non-unique index column. Authors: - GALI PREM SAGAR (https://github.com/galipremsagar) Approvers: - Matthew Roeschke (https://github.com/mroeschke) - Lawrence Mitchell (https://github.com/wence-) URL: https://github.com/rapidsai/cudf/pull/14400 --- python/cudf/cudf/core/indexed_frame.py | 4 ++++ python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++ python/cudf/cudf/tests/test_series.py | 12 ++++++++++++ python/dask_cudf/dask_cudf/backends.py | 13 ++++--------- 4 files changed, 32 insertions(+), 9 deletions(-) diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py index 376bef6d0b2..4211a8c24bf 100644 --- a/python/cudf/cudf/core/indexed_frame.py +++ b/python/cudf/cudf/core/indexed_frame.py @@ -2607,6 +2607,10 @@ def _reindex( df = self if index is not None: + if not df._index.is_unique: + raise ValueError( + "cannot reindex on an axis with duplicate labels" + ) index = cudf.core.index.as_index( index, name=getattr(index, "name", self._index.name) ) diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py index d44cf594e8b..5677f97408a 100644 --- a/python/cudf/cudf/tests/test_dataframe.py +++ b/python/cudf/cudf/tests/test_dataframe.py @@ -10723,3 +10723,15 @@ def test_dataframe_series_dot(): expected = gser @ [12, 13] assert_eq(expected, actual) + + +def test_dataframe_duplicate_index_reindex(): + gdf = cudf.DataFrame({"a": [0, 1, 2, 3]}, index=[0, 0, 1, 1]) + pdf = gdf.to_pandas() + + assert_exceptions_equal( + gdf.reindex, + pdf.reindex, + lfunc_args_and_kwargs=([10, 11, 12, 13], {}), + rfunc_args_and_kwargs=([10, 11, 12, 13], {}), + ) diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py index 8f8f87c20e0..c15a797713f 100644 --- a/python/cudf/cudf/tests/test_series.py +++ b/python/cudf/cudf/tests/test_series.py @@ -2638,3 +2638,15 @@ def test_series_setitem_mixed_bool_dtype(): s = cudf.Series([True, False, True]) with pytest.raises(TypeError): s[0] = 10 + + +def test_series_duplicate_index_reindex(): + gs = cudf.Series([0, 1, 2, 3], index=[0, 0, 1, 1]) + ps = gs.to_pandas() + + assert_exceptions_equal( + gs.reindex, + ps.reindex, + lfunc_args_and_kwargs=([10, 11, 12, 13], {}), + rfunc_args_and_kwargs=([10, 11, 12, 13], {}), + ) diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py index b0da82eaeee..387643587d1 100644 --- a/python/dask_cudf/dask_cudf/backends.py +++ b/python/dask_cudf/dask_cudf/backends.py @@ -427,17 +427,12 @@ def union_categoricals_cudf( ) -@_dask_cudf_nvtx_annotate -def safe_hash(frame): - return cudf.Series(frame.hash_values(), index=frame.index) - - @hash_object_dispatch.register((cudf.DataFrame, cudf.Series)) @_dask_cudf_nvtx_annotate def hash_object_cudf(frame, index=True): if index: - return safe_hash(frame.reset_index()) - return safe_hash(frame) + frame = frame.reset_index() + return frame.hash_values() @hash_object_dispatch.register(cudf.BaseIndex) @@ -445,10 +440,10 @@ def hash_object_cudf(frame, index=True): def hash_object_cudf_index(ind, index=None): if isinstance(ind, cudf.MultiIndex): - return safe_hash(ind.to_frame(index=False)) + return ind.to_frame(index=False).hash_values() col = cudf.core.column.as_column(ind) - return safe_hash(cudf.Series(col)) + return cudf.Series(col).hash_values() @group_split_dispatch.register((cudf.Series, cudf.DataFrame)) From 9e7f8a5fdd03d6a24630687621d0ee14c2db26d7 Mon Sep 17 00:00:00 2001 From: Vyas Ramasubramani Date: Wed, 15 Nov 2023 15:27:28 -0800 Subject: [PATCH 16/17] Fix dask dependency in custreamz (#14420) #14407 added a dask dependency to custreamz, but it added too tight of a pinning by requiring the exact same version. This is not valid because rapids-dask-dependency won't release a new version corresponding to each new cudf release, so pinning to the exact same version up to the alpha creates an unsatisfiable constraint. Authors: - Vyas Ramasubramani (https://github.com/vyasr) Approvers: - Ray Douglass (https://github.com/raydouglass) - Bradley Dice (https://github.com/bdice) - GALI PREM SAGAR (https://github.com/galipremsagar) --- conda/recipes/custreamz/meta.yaml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index b8c5918ea60..755394e3936 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -45,7 +45,7 @@ requirements: - streamz - cudf ={{ version }} - cudf_kafka ={{ version }} - - rapids-dask-dependency ={{ version }} + - rapids-dask-dependency ={{ minor_version }} - python-confluent-kafka >=1.9.0,<1.10.0a0 - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }} From e4e69757b340ce45e8ceca53047f079c8b3eb648 Mon Sep 17 00:00:00 2001 From: Bradley Dice Date: Wed, 15 Nov 2023 16:31:03 -0800 Subject: [PATCH 17/17] Update cudf_kafka_version. --- python/cudf_kafka/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt index d55c3fdc076..1e21c873585 100644 --- a/python/cudf_kafka/CMakeLists.txt +++ b/python/cudf_kafka/CMakeLists.txt @@ -14,7 +14,7 @@ cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR) -set(cudf_kafka_version 23.12.00) +set(cudf_kafka_version 24.02.00) include(../../fetch_rapids.cmake)