From 9be4de53268d49665bc0d700f12f1192207fff79 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Thu, 9 Nov 2023 16:00:43 -0800
Subject: [PATCH 01/17] Upgrade to nvCOMP 3.0.4 (#13815)

Update the nvCOMP version used for cuIO compression/decompression to 3.0.4.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)
  - Bradley Dice (https://github.com/bdice)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Ray Douglass (https://github.com/raydouglass)

URL: https://github.com/rapidsai/cudf/pull/13815
---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 2 +-
 conda/environments/all_cuda-120_arch-x86_64.yaml | 2 +-
 conda/recipes/libcudf/conda_build_config.yaml    | 2 +-
 dependencies.yaml                                | 2 +-
 4 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 4d5c56e4a7d..a479d517c24 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -60,7 +60,7 @@ dependencies:
 - numpy>=1.21,<1.25
 - numpydoc
 - nvcc_linux-64=11.8
-- nvcomp==2.6.1
+- nvcomp==3.0.4
 - nvtx>=0.2.1
 - packaging
 - pandas>=1.3,<1.6.0dev0
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index 8606932dae4..d1779aaeeac 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -59,7 +59,7 @@ dependencies:
 - numba>=0.57,<0.58
 - numpy>=1.21,<1.25
 - numpydoc
-- nvcomp==2.6.1
+- nvcomp==3.0.4
 - nvtx>=0.2.1
 - packaging
 - pandas>=1.3,<1.6.0dev0
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index 05b2135184b..fa06ed048b7 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -38,7 +38,7 @@ spdlog_version:
   - ">=1.11.0,<1.12"
 
 nvcomp_version:
-  - "=2.6.1"
+  - "=3.0.4"
 
 zlib_version:
   - ">=1.2.13"
diff --git a/dependencies.yaml b/dependencies.yaml
index 35d08239a4c..3850347aa63 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -244,7 +244,7 @@ dependencies:
           - libarrow-all==14.0.1.*
           - librdkafka>=1.9.0,<1.10.0a0
           # Align nvcomp version with rapids-cmake
-          - nvcomp==2.6.1
+          - nvcomp==3.0.4
           - spdlog>=1.11.0,<1.12
   build_wheels:
     common:

From 87d2a36f04f431a8c5236d2aee723ec79b9dc5f9 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Thu, 9 Nov 2023 16:54:25 -0800
Subject: [PATCH 02/17] Remove Cython libcpp wrappers (#14382)

All of these wrappers have now been upstreamed into Cython as of Cython 3.0.3.

Contributes to #14023

Authors:
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - GALI PREM SAGAR (https://github.com/galipremsagar)
  - Bradley Dice (https://github.com/bdice)
  - Jake Awe (https://github.com/AyodeAwe)

URL: https://github.com/rapidsai/cudf/pull/14382
---
 .../all_cuda-118_arch-x86_64.yaml             |  2 +-
 .../all_cuda-120_arch-x86_64.yaml             |  2 +-
 conda/recipes/cudf/meta.yaml                  |  2 +-
 conda/recipes/cudf_kafka/meta.yaml            |  2 +-
 dependencies.yaml                             |  2 +-
 python/cudf/cudf/_lib/column.pyx              |  3 +-
 python/cudf/cudf/_lib/concat.pyx              |  3 +-
 python/cudf/cudf/_lib/copying.pyx             |  5 +-
 python/cudf/cudf/_lib/cpp/copying.pxd         |  2 +-
 python/cudf/cudf/_lib/cpp/groupby.pxd         |  4 +-
 python/cudf/cudf/_lib/cpp/io/orc.pxd          |  2 +-
 python/cudf/cudf/_lib/cpp/io/parquet.pxd      |  4 +-
 python/cudf/cudf/_lib/cpp/io/timezone.pxd     |  2 +-
 python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd |  0
 python/cudf/cudf/_lib/cpp/libcpp/__init__.py  |  0
 .../cudf/cudf/_lib/cpp/libcpp/functional.pxd  |  7 ---
 python/cudf/cudf/_lib/cpp/libcpp/memory.pxd   | 12 -----
 python/cudf/cudf/_lib/cpp/libcpp/optional.pxd | 50 -------------------
 python/cudf/cudf/_lib/expressions.pyx         |  3 +-
 python/cudf/cudf/_lib/groupby.pyx             |  3 +-
 python/cudf/cudf/_lib/join.pyx                |  3 +-
 python/cudf/cudf/_lib/null_mask.pyx           |  3 +-
 python/cudf/cudf/_lib/parquet.pyx             |  3 +-
 python/cudf/cudf/_lib/timezone.pyx            |  2 +-
 python/cudf/pyproject.toml                    |  2 +-
 python/cudf_kafka/cudf_kafka/_lib/kafka.pyx   |  3 +-
 python/cudf_kafka/pyproject.toml              |  2 +-
 27 files changed, 27 insertions(+), 101 deletions(-)
 delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd
 delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/__init__.py
 delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/functional.pxd
 delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/memory.pxd
 delete mode 100644 python/cudf/cudf/_lib/cpp/libcpp/optional.pxd

diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index a479d517c24..adf4fcad32d 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -24,7 +24,7 @@ dependencies:
 - cudatoolkit
 - cupy>=12.0.0
 - cxx-compiler
-- cython>=3.0.0
+- cython>=3.0.3
 - dask-core>=2023.9.2
 - dask-cuda==23.12.*
 - dask>=2023.9.2
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index d1779aaeeac..a69ef587570 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -26,7 +26,7 @@ dependencies:
 - cuda-version=12.0
 - cupy>=12.0.0
 - cxx-compiler
-- cython>=3.0.0
+- cython>=3.0.3
 - dask-core>=2023.9.2
 - dask-cuda==23.12.*
 - dask>=2023.9.2
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 9b5c5f3d14b..27edde1c98a 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -57,7 +57,7 @@ requirements:
   host:
     - protobuf ==4.24.*
     - python
-    - cython >=3.0.0
+    - cython >=3.0.3
     - scikit-build >=0.13.1
     - setuptools
     - dlpack >=0.5,<0.6.0a0
diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml
index cdc547b4d68..9440f8bf124 100644
--- a/conda/recipes/cudf_kafka/meta.yaml
+++ b/conda/recipes/cudf_kafka/meta.yaml
@@ -51,7 +51,7 @@ requirements:
     {% endif %}
   host:
     - python
-    - cython >=3.0.0
+    - cython >=3.0.3
     - cuda-version ={{ cuda_version }}
     - cudf ={{ version }}
     - libcudf_kafka ={{ version }}
diff --git a/dependencies.yaml b/dependencies.yaml
index 3850347aa63..a16b51f4483 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -256,7 +256,7 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - cython>=3.0.0
+          - cython>=3.0.3
           # TODO: Pin to numpy<1.25 until cudf requires pandas 2
           - &numpy numpy>=1.21,<1.25
       - output_types: [conda, requirements, pyproject]
diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index f751d73b142..0edf9f8aa95 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -24,7 +24,7 @@ from cudf.utils.dtypes import _get_base_dtype
 
 from cpython.buffer cimport PyObject_CheckBuffer
 from libc.stdint cimport uintptr_t
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
@@ -47,7 +47,6 @@ from cudf._lib.cpp.column.column_factories cimport (
     make_numeric_column,
 )
 from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 from cudf._lib.cpp.null_mask cimport null_count as cpp_null_count
 from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.scalar cimport DeviceScalar
diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx
index feaf75ef237..1ec4719631e 100644
--- a/python/cudf/cudf/_lib/concat.pyx
+++ b/python/cudf/cudf/_lib/concat.pyx
@@ -1,7 +1,7 @@
 # Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
@@ -12,7 +12,6 @@ from cudf._lib.cpp.concatenate cimport (
     concatenate_masks as libcudf_concatenate_masks,
     concatenate_tables as libcudf_concatenate_tables,
 )
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 from cudf._lib.cpp.table.table cimport table, table_view
 from cudf._lib.utils cimport (
     data_from_unique_ptr,
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index f57bc15ed57..ea6ee76c14a 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -24,12 +24,13 @@ from cudf._lib.utils cimport table_view_from_columns, table_view_from_table
 from cudf._lib.reduce import minmax
 from cudf.core.abc import Serializable
 
+from libcpp.functional cimport reference_wrapper
+from libcpp.memory cimport make_unique
+
 cimport cudf._lib.cpp.contiguous_split as cpp_contiguous_split
 cimport cudf._lib.cpp.copying as cpp_copying
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view
-from cudf._lib.cpp.libcpp.functional cimport reference_wrapper
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 from cudf._lib.cpp.lists.gather cimport (
     segmented_gather as cpp_segmented_gather,
 )
diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd
index 20725c252fc..5637b55ac1c 100644
--- a/python/cudf/cudf/_lib/cpp/copying.pxd
+++ b/python/cudf/cudf/_lib/cpp/copying.pxd
@@ -2,6 +2,7 @@
 
 from libc.stdint cimport int32_t, int64_t, uint8_t
 from libcpp cimport bool
+from libcpp.functional cimport reference_wrapper
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 
@@ -9,7 +10,6 @@ from rmm._lib.device_buffer cimport device_buffer
 
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view
-from cudf._lib.cpp.libcpp.functional cimport reference_wrapper
 from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
diff --git a/python/cudf/cudf/_lib/cpp/groupby.pxd b/python/cudf/cudf/_lib/cpp/groupby.pxd
index 2ecdf76842f..0266404fc50 100644
--- a/python/cudf/cudf/_lib/cpp/groupby.pxd
+++ b/python/cudf/cudf/_lib/cpp/groupby.pxd
@@ -1,6 +1,7 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 from libcpp cimport bool
+from libcpp.functional cimport reference_wrapper
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.vector cimport vector
@@ -11,7 +12,6 @@ from cudf._lib.cpp.aggregation cimport (
 )
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.libcpp.functional cimport reference_wrapper
 from cudf._lib.cpp.replace cimport replace_policy
 from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.table.table cimport table
diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd
index dd6f919a74d..d5ac8574fe4 100644
--- a/python/cudf/cudf/_lib/cpp/io/orc.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd
@@ -4,12 +4,12 @@ from libc.stdint cimport uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr
+from libcpp.optional cimport optional
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
 cimport cudf._lib.cpp.io.types as cudf_io_types
 cimport cudf._lib.cpp.table.table_view as cudf_table_view
-from cudf._lib.cpp.libcpp.optional cimport optional
 from cudf._lib.cpp.types cimport data_type, size_type
 
 
diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
index a6a7ba034aa..cdd1bde0274 100644
--- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
@@ -2,16 +2,16 @@
 
 from libc.stdint cimport uint8_t
 from libcpp cimport bool
+from libcpp.functional cimport reference_wrapper
 from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr
+from libcpp.optional cimport optional
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
 cimport cudf._lib.cpp.io.types as cudf_io_types
 cimport cudf._lib.cpp.table.table_view as cudf_table_view
 from cudf._lib.cpp.expressions cimport expression
-from cudf._lib.cpp.libcpp.functional cimport reference_wrapper
-from cudf._lib.cpp.libcpp.optional cimport optional
 from cudf._lib.cpp.types cimport data_type, size_type
 
 
diff --git a/python/cudf/cudf/_lib/cpp/io/timezone.pxd b/python/cudf/cudf/_lib/cpp/io/timezone.pxd
index ba481d9a1d3..927c2118473 100644
--- a/python/cudf/cudf/_lib/cpp/io/timezone.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/timezone.pxd
@@ -2,9 +2,9 @@
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
+from libcpp.optional cimport optional
 from libcpp.string cimport string
 
-from cudf._lib.cpp.libcpp.optional cimport optional
 from cudf._lib.cpp.table.table cimport table
 
 
diff --git a/python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd b/python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/cpp/libcpp/__init__.py b/python/cudf/cudf/_lib/cpp/libcpp/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd
deleted file mode 100644
index f3e2d6d0878..00000000000
--- a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
-
-
-cdef extern from "<functional>" namespace "std" nogil:
-    cdef cppclass reference_wrapper[T]:
-        reference_wrapper()
-        reference_wrapper(T)
diff --git a/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd b/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd
deleted file mode 100644
index 2178f1a940c..00000000000
--- a/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-
-from libcpp.memory cimport unique_ptr
-
-
-cdef extern from "<memory>" namespace "std" nogil:
-    # The Cython standard header does not have except +, so C++
-    # exceptions from make_unique are not caught and translated to
-    # Python ones. This is not perfectly ergonomic, we always have to
-    # wrap make_unique in move, but at least we can catch exceptions.
-    # See https://github.com/cython/cython/issues/5560
-    unique_ptr[T] make_unique[T](...) except +
diff --git a/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd
deleted file mode 100644
index a78c18f3f7a..00000000000
--- a/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd
+++ /dev/null
@@ -1,50 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION &
-# AFFILIATES. All rights reserved.  SPDX-License-Identifier:
-# Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from libcpp cimport bool
-
-
-cdef extern from "<optional>" namespace "std" nogil:
-    cdef cppclass nullopt_t:
-        nullopt_t()
-
-    cdef nullopt_t nullopt
-
-    cdef cppclass optional[T]:
-        ctypedef T value_type
-        optional()
-        optional(nullopt_t)
-        optional(optional&) except +
-        optional(T&) except +
-        bool has_value()
-        T& value()
-        T& value_or[U](U& default_value)
-        void swap(optional&)
-        void reset()
-        T& emplace(...)
-        T& operator*()
-        optional& operator=(optional&)
-        optional& operator=[U](U&)
-        bool operator bool()
-        bool operator!()
-        bool operator==[U](optional&, U&)
-        bool operator!=[U](optional&, U&)
-        bool operator<[U](optional&, U&)
-        bool operator>[U](optional&, U&)
-        bool operator<=[U](optional&, U&)
-        bool operator>=[U](optional&, U&)
-
-    optional[T] make_optional[T](...) except +
diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx
index 8d7545ffe15..01a080f635f 100644
--- a/python/cudf/cudf/_lib/expressions.pyx
+++ b/python/cudf/cudf/_lib/expressions.pyx
@@ -4,12 +4,11 @@ from enum import Enum
 
 from cython.operator cimport dereference
 from libc.stdint cimport int64_t
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 
 from cudf._lib.cpp cimport expressions as libcudf_exp
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 from cudf._lib.cpp.types cimport size_type
 
 # Necessary for proper casting, see below.
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index a26d820de6f..b3778e45cde 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -24,6 +24,8 @@ from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
 
 from cudf._lib.scalar import as_device_scalar
 
+from libcpp.functional cimport reference_wrapper
+
 cimport cudf._lib.cpp.groupby as libcudf_groupby
 cimport cudf._lib.cpp.types as libcudf_types
 from cudf._lib.aggregation cimport (
@@ -33,7 +35,6 @@ from cudf._lib.aggregation cimport (
     make_groupby_scan_aggregation,
 )
 from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.libcpp.functional cimport reference_wrapper
 from cudf._lib.cpp.replace cimport replace_policy
 from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.table.table cimport table, table_view
diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx
index 416680aae24..378be978cc0 100644
--- a/python/cudf/cudf/_lib/join.pyx
+++ b/python/cudf/cudf/_lib/join.pyx
@@ -2,7 +2,7 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.pair cimport pair
 from libcpp.utility cimport move
 
@@ -11,7 +11,6 @@ from rmm._lib.device_buffer cimport device_buffer
 cimport cudf._lib.cpp.join as cpp_join
 from cudf._lib.column cimport Column
 from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.cpp.types cimport data_type, size_type, type_id
 from cudf._lib.utils cimport table_view_from_columns
diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx
index 5b4538629f6..1f98140d9e4 100644
--- a/python/cudf/cudf/_lib/null_mask.pyx
+++ b/python/cudf/cudf/_lib/null_mask.pyx
@@ -6,13 +6,12 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
 
 from cudf.core.buffer import acquire_spill_lock, as_buffer
 
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.pair cimport pair
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
 from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 from cudf._lib.cpp.null_mask cimport (
     bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes,
     bitmask_and as cpp_bitmask_and,
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index d8d363686cc..4acb1ce10b1 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -32,7 +32,7 @@ from cudf._lib.utils import _index_level_name, generate_pandas_metadata
 from libc.stdint cimport uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.unordered_map cimport unordered_map
 from libcpp.utility cimport move
@@ -52,7 +52,6 @@ from cudf._lib.cpp.io.parquet cimport (
     write_parquet as parquet_writer,
 )
 from cudf._lib.cpp.io.types cimport column_in_metadata, table_input_metadata
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.cpp.types cimport data_type, size_type
 from cudf._lib.io.datasource cimport NativeFileDatasource
diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx
index 4d76cbfcdb5..808d1321b0b 100644
--- a/python/cudf/cudf/_lib/timezone.pyx
+++ b/python/cudf/cudf/_lib/timezone.pyx
@@ -1,13 +1,13 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
+from libcpp.optional cimport make_optional
 from libcpp.string cimport string
 from libcpp.utility cimport move
 
 from cudf._lib.cpp.io.timezone cimport (
     make_timezone_transition_table as cpp_make_timezone_transition_table,
 )
-from cudf._lib.cpp.libcpp.optional cimport make_optional
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.utils cimport columns_from_unique_ptr
 
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 1c687269e55..b38970271d7 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -4,7 +4,7 @@
 build-backend = "setuptools.build_meta"
 requires = [
     "cmake>=3.26.4",
-    "cython>=3.0.0",
+    "cython>=3.0.3",
     "ninja",
     "numpy>=1.21,<1.25",
     "protoc-wheel",
diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx
index 4d732478723..2fbaacff7c6 100644
--- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx
+++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx
@@ -3,12 +3,11 @@
 from libc.stdint cimport int32_t, int64_t
 from libcpp cimport bool, nullptr
 from libcpp.map cimport map
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 
 from cudf._lib.cpp.io.datasource cimport datasource
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 
 from cudf_kafka._lib.kafka cimport kafka_consumer
 
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index f5cbd480e9c..4829f06ab09 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -3,7 +3,7 @@
 [build-system]
 
 requires = [
-    "cython>=3.0.0",
+    "cython>=3.0.3",
     "numpy>=1.21,<1.25",
     "pyarrow==14.0.1.*",
     "setuptools",

From 04d13d81b0bb4c2b3db2bfc9d9e28432e0a73c44 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Mon, 13 Nov 2023 09:05:19 -0500
Subject: [PATCH 03/17] Normalizing offsets iterator (#14234)

Creates a normalizing offsets iterator that returns an int64 value given either a int32 or int64 column data.
Depends on #14206

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Divye Gala (https://github.com/divyegala)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/14234
---
 .../cudf/column/column_device_view.cuh        |   8 +-
 cpp/include/cudf/detail/indexalator.cuh       | 151 ++++++++++++++--
 .../cudf/detail/normalizing_iterator.cuh      | 160 +----------------
 cpp/include/cudf/detail/offsets_iterator.cuh  | 165 ++++++++++++++++++
 .../cudf/detail/offsets_iterator_factory.cuh  |  47 +++++
 cpp/tests/CMakeLists.txt                      |   1 +
 cpp/tests/iterator/indexalator_test.cu        |  37 ----
 cpp/tests/iterator/offsetalator_test.cu       | 140 +++++++++++++++
 8 files changed, 502 insertions(+), 207 deletions(-)
 create mode 100644 cpp/include/cudf/detail/offsets_iterator.cuh
 create mode 100644 cpp/include/cudf/detail/offsets_iterator_factory.cuh
 create mode 100644 cpp/tests/iterator/offsetalator_test.cu

diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index 35851a99822..b1ff0bbaea7 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <cudf/column/column_view.hpp>
+#include <cudf/detail/offsets_iterator.cuh>
 #include <cudf/detail/utilities/alignment.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/lists/list_view.hpp>
@@ -442,10 +443,11 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
   __device__ T element(size_type element_index) const noexcept
   {
     size_type index       = element_index + offset();  // account for this view's _offset
-    auto const* d_offsets = d_children[strings_column_view::offsets_column_index].data<int32_t>();
     char const* d_strings = d_children[strings_column_view::chars_column_index].data<char>();
-    size_type offset      = d_offsets[index];
-    return string_view{d_strings + offset, d_offsets[index + 1] - offset};
+    auto const offsets    = d_children[strings_column_view::offsets_column_index];
+    auto const itr        = cudf::detail::input_offsetalator(offsets.head(), offsets.type());
+    auto const offset     = itr[index];
+    return string_view{d_strings + offset, static_cast<cudf::size_type>(itr[index + 1] - offset)};
   }
 
  private:
diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh
index 6532dae3695..4d261c54b29 100644
--- a/cpp/include/cudf/detail/indexalator.cuh
+++ b/cpp/include/cudf/detail/indexalator.cuh
@@ -56,10 +56,69 @@ namespace detail {
  *  auto result = thrust::find(thrust::device, begin, end, size_type{12} );
  * @endcode
  */
-using input_indexalator = input_normalator<cudf::size_type>;
+struct input_indexalator : base_normalator<input_indexalator, cudf::size_type> {
+  friend struct base_normalator<input_indexalator, cudf::size_type>;  // for CRTP
+
+  using reference = cudf::size_type const;  // this keeps STL and thrust happy
+
+  input_indexalator()                                    = default;
+  input_indexalator(input_indexalator const&)            = default;
+  input_indexalator(input_indexalator&&)                 = default;
+  input_indexalator& operator=(input_indexalator const&) = default;
+  input_indexalator& operator=(input_indexalator&&)      = default;
+
+  /**
+   * @brief Indirection operator returns the value at the current iterator position
+   */
+  __device__ inline cudf::size_type operator*() const { return operator[](0); }
+
+  /**
+   * @brief Dispatch functor for resolving a Integer value from any integer type
+   */
+  struct normalize_type {
+    template <typename T, CUDF_ENABLE_IF(cudf::is_index_type<T>())>
+    __device__ cudf::size_type operator()(void const* tp)
+    {
+      return static_cast<cudf::size_type>(*static_cast<T const*>(tp));
+    }
+    template <typename T, CUDF_ENABLE_IF(not cudf::is_index_type<T>())>
+    __device__ cudf::size_type operator()(void const*)
+    {
+      CUDF_UNREACHABLE("only integral types are supported");
+    }
+  };
+
+  /**
+   * @brief Array subscript operator returns a value at the input
+   * `idx` position as a `Integer` value.
+   */
+  __device__ inline cudf::size_type operator[](size_type idx) const
+  {
+    void const* tp = p_ + (idx * this->width_);
+    return type_dispatcher(this->dtype_, normalize_type{}, tp);
+  }
+
+  /**
+   * @brief Create an input index normalizing iterator
+   *
+   * Use the indexalator_factory to create an iterator instance.
+   *
+   * @param data   Pointer to an integer array in device memory.
+   * @param dtype  Type of data in data
+   * @param offset Applied to the data pointer per size of the type
+   */
+  CUDF_HOST_DEVICE input_indexalator(void const* data, data_type dtype, cudf::size_type offset = 0)
+    : base_normalator<input_indexalator, cudf::size_type>(dtype), p_{static_cast<char const*>(data)}
+  {
+    p_ += offset * this->width_;
+  }
+
+ protected:
+  char const* p_;  /// pointer to the integer data in device memory
+};
 
 /**
- * @brief The index normalizing output iterator.
+ * @brief The index normalizing output iterator
  *
  * This is an iterator that can be used for index types (integers) without
  * requiring a type-specific instance. It can be used for any iterator
@@ -82,7 +141,75 @@ using input_indexalator = input_normalator<cudf::size_type>;
  *                      thrust::less<Element>());
  * @endcode
  */
-using output_indexalator = output_normalator<cudf::size_type>;
+struct output_indexalator : base_normalator<output_indexalator, cudf::size_type> {
+  friend struct base_normalator<output_indexalator, cudf::size_type>;  // for CRTP
+
+  using reference = output_indexalator const&;  // required for output iterators
+
+  output_indexalator()                                     = default;
+  output_indexalator(output_indexalator const&)            = default;
+  output_indexalator(output_indexalator&&)                 = default;
+  output_indexalator& operator=(output_indexalator const&) = default;
+  output_indexalator& operator=(output_indexalator&&)      = default;
+
+  /**
+   * @brief Indirection operator returns this iterator instance in order
+   * to capture the `operator=(Integer)` calls.
+   */
+  __device__ inline reference operator*() const { return *this; }
+
+  /**
+   * @brief Array subscript operator returns an iterator instance at the specified `idx` position.
+   *
+   * This allows capturing the subsequent `operator=(Integer)` call in this class.
+   */
+  __device__ inline output_indexalator const operator[](size_type idx) const
+  {
+    output_indexalator tmp{*this};
+    tmp.p_ += (idx * this->width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Dispatch functor for setting the index value from a size_type value.
+   */
+  struct normalize_type {
+    template <typename T, CUDF_ENABLE_IF(cudf::is_index_type<T>())>
+    __device__ void operator()(void* tp, cudf::size_type const value)
+    {
+      (*static_cast<T*>(tp)) = static_cast<T>(value);
+    }
+    template <typename T, CUDF_ENABLE_IF(not cudf::is_index_type<T>())>
+    __device__ void operator()(void*, cudf::size_type const)
+    {
+      CUDF_UNREACHABLE("only index types are supported");
+    }
+  };
+
+  /**
+   * @brief Assign an Integer value to the current iterator position
+   */
+  __device__ inline reference operator=(cudf::size_type const value) const
+  {
+    void* tp = p_;
+    type_dispatcher(this->dtype_, normalize_type{}, tp, value);
+    return *this;
+  }
+
+  /**
+   * @brief Create an output normalizing iterator
+   *
+   * @param data      Pointer to an integer array in device memory.
+   * @param dtype Type of data in data
+   */
+  CUDF_HOST_DEVICE output_indexalator(void* data, data_type dtype)
+    : base_normalator<output_indexalator, cudf::size_type>(dtype), p_{static_cast<char*>(data)}
+  {
+  }
+
+ protected:
+  char* p_;  /// pointer to the integer data in device memory
+};
 
 /**
  * @brief Use this class to create an indexalator instance.
@@ -92,14 +219,12 @@ struct indexalator_factory {
    * @brief A type_dispatcher functor to create an input iterator from an indices column.
    */
   struct input_indexalator_fn {
-    template <typename IndexType, std::enable_if_t<is_index_type<IndexType>()>* = nullptr>
+    template <typename IndexType, CUDF_ENABLE_IF(is_index_type<IndexType>())>
     input_indexalator operator()(column_view const& indices)
     {
       return input_indexalator(indices.data<IndexType>(), indices.type());
     }
-    template <typename IndexType,
-              typename... Args,
-              std::enable_if_t<not is_index_type<IndexType>()>* = nullptr>
+    template <typename IndexType, typename... Args, CUDF_ENABLE_IF(not is_index_type<IndexType>())>
     input_indexalator operator()(Args&&... args)
     {
       CUDF_FAIL("indices must be an index type");
@@ -110,16 +235,14 @@ struct indexalator_factory {
    * @brief Use this class to create an indexalator to a scalar index.
    */
   struct input_indexalator_scalar_fn {
-    template <typename IndexType, std::enable_if_t<is_index_type<IndexType>()>* = nullptr>
+    template <typename IndexType, CUDF_ENABLE_IF(is_index_type<IndexType>())>
     input_indexalator operator()(scalar const& index)
     {
       // note: using static_cast<scalar_type_t<IndexType> const&>(index) creates a copy
       auto const scalar_impl = static_cast<scalar_type_t<IndexType> const*>(&index);
       return input_indexalator(scalar_impl->data(), index.type());
     }
-    template <typename IndexType,
-              typename... Args,
-              std::enable_if_t<not is_index_type<IndexType>()>* = nullptr>
+    template <typename IndexType, typename... Args, CUDF_ENABLE_IF(not is_index_type<IndexType>())>
     input_indexalator operator()(Args&&... args)
     {
       CUDF_FAIL("scalar must be an index type");
@@ -130,14 +253,12 @@ struct indexalator_factory {
    * @brief A type_dispatcher functor to create an output iterator from an indices column.
    */
   struct output_indexalator_fn {
-    template <typename IndexType, std::enable_if_t<is_index_type<IndexType>()>* = nullptr>
+    template <typename IndexType, CUDF_ENABLE_IF(is_index_type<IndexType>())>
     output_indexalator operator()(mutable_column_view const& indices)
     {
       return output_indexalator(indices.data<IndexType>(), indices.type());
     }
-    template <typename IndexType,
-              typename... Args,
-              std::enable_if_t<not is_index_type<IndexType>()>* = nullptr>
+    template <typename IndexType, typename... Args, CUDF_ENABLE_IF(not is_index_type<IndexType>())>
     output_indexalator operator()(Args&&... args)
     {
       CUDF_FAIL("indices must be an index type");
diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh
index 35a695d47df..8f90afc3e57 100644
--- a/cpp/include/cudf/detail/normalizing_iterator.cuh
+++ b/cpp/include/cudf/detail/normalizing_iterator.cuh
@@ -33,7 +33,7 @@ namespace detail {
  * @tparam Integer The type the iterator normalizes to
  */
 template <class Derived, typename Integer>
-struct base_normalator {
+struct alignas(16) base_normalator {
   static_assert(cudf::is_index_type<Integer>());
   using difference_type   = std::ptrdiff_t;
   using value_type        = Integer;
@@ -204,7 +204,7 @@ struct base_normalator {
 
  private:
   struct integer_sizeof_fn {
-    template <typename T, std::enable_if_t<not cudf::is_index_type<T>()>* = nullptr>
+    template <typename T, CUDF_ENABLE_IF(not cudf::is_fixed_width<T>())>
     CUDF_HOST_DEVICE constexpr std::size_t operator()() const
     {
 #ifndef __CUDA_ARCH__
@@ -213,7 +213,7 @@ struct base_normalator {
       CUDF_UNREACHABLE("only integral types are supported");
 #endif
     }
-    template <typename T, std::enable_if_t<cudf::is_index_type<T>()>* = nullptr>
+    template <typename T, CUDF_ENABLE_IF(cudf::is_fixed_width<T>())>
     CUDF_HOST_DEVICE constexpr std::size_t operator()() const noexcept
     {
       return sizeof(T);
@@ -229,160 +229,16 @@ struct base_normalator {
     width_ = static_cast<int32_t>(type_dispatcher(dtype, integer_sizeof_fn{}));
   }
 
-  int32_t width_;    /// integer type width = 1,2,4, or 8
-  data_type dtype_;  /// for type-dispatcher calls
-};
-
-/**
- * @brief The integer normalizing input iterator
- *
- * This is an iterator that can be used for index types (integers) without
- * requiring a type-specific instance. It can be used for any iterator
- * interface for reading an array of integer values of type
- * int8, int16, int32, int64, uint8, uint16, uint32, or uint64.
- * Reading specific elements always return a type of `Integer`
- *
- * @tparam Integer Type returned by all read functions
- */
-template <typename Integer>
-struct input_normalator : base_normalator<input_normalator<Integer>, Integer> {
-  friend struct base_normalator<input_normalator<Integer>, Integer>;  // for CRTP
-
-  using reference = Integer const;  // this keeps STL and thrust happy
-
-  input_normalator()                                   = default;
-  input_normalator(input_normalator const&)            = default;
-  input_normalator(input_normalator&&)                 = default;
-  input_normalator& operator=(input_normalator const&) = default;
-  input_normalator& operator=(input_normalator&&)      = default;
-
-  /**
-   * @brief Indirection operator returns the value at the current iterator position
-   */
-  __device__ inline Integer operator*() const { return operator[](0); }
-
-  /**
-   * @brief Dispatch functor for resolving a Integer value from any integer type
-   */
-  struct normalize_type {
-    template <typename T, std::enable_if_t<cudf::is_index_type<T>()>* = nullptr>
-    __device__ Integer operator()(void const* tp)
-    {
-      return static_cast<Integer>(*static_cast<T const*>(tp));
-    }
-    template <typename T, std::enable_if_t<not cudf::is_index_type<T>()>* = nullptr>
-    __device__ Integer operator()(void const*)
-    {
-      CUDF_UNREACHABLE("only integral types are supported");
-    }
-  };
-
   /**
-   * @brief Array subscript operator returns a value at the input
-   * `idx` position as a `Integer` value.
-   */
-  __device__ inline Integer operator[](size_type idx) const
-  {
-    void const* tp = p_ + (idx * this->width_);
-    return type_dispatcher(this->dtype_, normalize_type{}, tp);
-  }
-
-  /**
-   * @brief Create an input index normalizing iterator.
-   *
-   * Use the indexalator_factory to create an iterator instance.
-   *
-   * @param data      Pointer to an integer array in device memory.
-   * @param data_type Type of data in data
-   */
-  CUDF_HOST_DEVICE input_normalator(void const* data, data_type dtype, cudf::size_type offset = 0)
-    : base_normalator<input_normalator<Integer>, Integer>(dtype), p_{static_cast<char const*>(data)}
-  {
-    p_ += offset * this->width_;
-  }
-
-  char const* p_;  /// pointer to the integer data in device memory
-};
-
-/**
- * @brief The integer normalizing output iterator
- *
- * This is an iterator that can be used for index types (integers) without
- * requiring a type-specific instance. It can be used for any iterator
- * interface for writing an array of integer values of type
- * int8, int16, int32, int64, uint8, uint16, uint32, or uint64.
- * Setting specific elements always accept the `Integer` type values.
- *
- * @tparam Integer The type used for all write functions
- */
-template <typename Integer>
-struct output_normalator : base_normalator<output_normalator<Integer>, Integer> {
-  friend struct base_normalator<output_normalator<Integer>, Integer>;  // for CRTP
-
-  using reference = output_normalator const&;  // required for output iterators
-
-  output_normalator()                                    = default;
-  output_normalator(output_normalator const&)            = default;
-  output_normalator(output_normalator&&)                 = default;
-  output_normalator& operator=(output_normalator const&) = default;
-  output_normalator& operator=(output_normalator&&)      = default;
-
-  /**
-   * @brief Indirection operator returns this iterator instance in order
-   * to capture the `operator=(Integer)` calls.
-   */
-  __device__ inline output_normalator const& operator*() const { return *this; }
-
-  /**
-   * @brief Array subscript operator returns an iterator instance at the specified `idx` position.
-   *
-   * This allows capturing the subsequent `operator=(Integer)` call in this class.
-   */
-  __device__ inline output_normalator const operator[](size_type idx) const
-  {
-    output_normalator tmp{*this};
-    tmp.p_ += (idx * this->width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Dispatch functor for setting the index value from a size_type value.
-   */
-  struct normalize_type {
-    template <typename T, std::enable_if_t<cudf::is_index_type<T>()>* = nullptr>
-    __device__ void operator()(void* tp, Integer const value)
-    {
-      (*static_cast<T*>(tp)) = static_cast<T>(value);
-    }
-    template <typename T, std::enable_if_t<not cudf::is_index_type<T>()>* = nullptr>
-    __device__ void operator()(void*, Integer const)
-    {
-      CUDF_UNREACHABLE("only index types are supported");
-    }
-  };
-
-  /**
-   * @brief Assign an Integer value to the current iterator position
-   */
-  __device__ inline output_normalator const& operator=(Integer const value) const
-  {
-    void* tp = p_;
-    type_dispatcher(this->dtype_, normalize_type{}, tp, value);
-    return *this;
-  }
-
-  /**
-   * @brief Create an output normalizing iterator
-   *
-   * @param data      Pointer to an integer array in device memory.
-   * @param data_type Type of data in data
+   * @brief Constructor assigns width and type member variables for base class.
    */
-  CUDF_HOST_DEVICE output_normalator(void* data, data_type dtype)
-    : base_normalator<output_normalator<Integer>, Integer>(dtype), p_{static_cast<char*>(data)}
+  explicit CUDF_HOST_DEVICE base_normalator(data_type dtype, int32_t width)
+    : width_(width), dtype_(dtype)
   {
   }
 
-  char* p_;  /// pointer to the integer data in device memory
+  int32_t width_;    /// integer type width = 1,2,4, or 8
+  data_type dtype_;  /// for type-dispatcher calls
 };
 
 }  // namespace detail
diff --git a/cpp/include/cudf/detail/offsets_iterator.cuh b/cpp/include/cudf/detail/offsets_iterator.cuh
new file mode 100644
index 00000000000..3eb77b32353
--- /dev/null
+++ b/cpp/include/cudf/detail/offsets_iterator.cuh
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/detail/normalizing_iterator.cuh>
+#include <cudf/types.hpp>
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief The offsets normalizing input iterator
+ *
+ * This is an iterator that can be used for offsets where the underlying
+ * type may be int32_t or int64_t.
+ *
+ * Use the offsetalator_factory to create an appropriate input iterator
+ * from an offsets column_view.
+ */
+struct input_offsetalator : base_normalator<input_offsetalator, int64_t> {
+  friend struct base_normalator<input_offsetalator, int64_t>;  // for CRTP
+
+  using reference = int64_t const;  // this keeps STL and thrust happy
+
+  input_offsetalator()                                     = default;
+  input_offsetalator(input_offsetalator const&)            = default;
+  input_offsetalator(input_offsetalator&&)                 = default;
+  input_offsetalator& operator=(input_offsetalator const&) = default;
+  input_offsetalator& operator=(input_offsetalator&&)      = default;
+
+  /**
+   * @brief Indirection operator returns the value at the current iterator position
+   */
+  __device__ inline int64_t operator*() const { return operator[](0); }
+
+  /**
+   * @brief Array subscript operator returns a value at the input
+   * `idx` position as a int64_t value.
+   */
+  __device__ inline int64_t operator[](size_type idx) const
+  {
+    void const* tp = p_ + (idx * this->width_);
+    return this->width_ == sizeof(int32_t) ? static_cast<int64_t>(*static_cast<int32_t const*>(tp))
+                                           : *static_cast<int64_t const*>(tp);
+  }
+
+  /**
+   * @brief Create an input index normalizing iterator.
+   *
+   * Use the indexalator_factory to create an iterator instance.
+   *
+   * @param data      Pointer to an integer array in device memory.
+   * @param dtype Type of data in data
+   */
+  CUDF_HOST_DEVICE input_offsetalator(void const* data, data_type dtype)
+    : base_normalator<input_offsetalator, int64_t>(
+        dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)),
+      p_{static_cast<char const*>(data)}
+  {
+#ifndef __CUDA_ARCH__
+    CUDF_EXPECTS(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64,
+                 "Unexpected offsets type");
+#else
+    cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) &&
+                "Unexpected offsets type");
+#endif
+  }
+
+ protected:
+  char const* p_;  /// pointer to the integer data in device memory
+};
+
+/**
+ * @brief The offsets normalizing output iterator
+ *
+ * This is an iterator that can be used for storing offsets values
+ * where the underlying type may be either int32_t or int64_t.
+ *
+ * Use the offsetalator_factory to create an appropriate output iterator
+ * from a mutable_column_view.
+ *
+ */
+struct output_offsetalator : base_normalator<output_offsetalator, int64_t> {
+  friend struct base_normalator<output_offsetalator, int64_t>;  // for CRTP
+
+  using reference = output_offsetalator const&;  // required for output iterators
+
+  output_offsetalator()                                      = default;
+  output_offsetalator(output_offsetalator const&)            = default;
+  output_offsetalator(output_offsetalator&&)                 = default;
+  output_offsetalator& operator=(output_offsetalator const&) = default;
+  output_offsetalator& operator=(output_offsetalator&&)      = default;
+
+  /**
+   * @brief Indirection operator returns this iterator instance in order
+   * to capture the `operator=(int64)` calls.
+   */
+  __device__ inline output_offsetalator const& operator*() const { return *this; }
+
+  /**
+   * @brief Array subscript operator returns an iterator instance at the specified `idx` position.
+   *
+   * This allows capturing the subsequent `operator=(int64)` call in this class.
+   */
+  __device__ inline output_offsetalator const operator[](size_type idx) const
+  {
+    output_offsetalator tmp{*this};
+    tmp.p_ += (idx * this->width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Assign an offset value to the current iterator position
+   */
+  __device__ inline output_offsetalator const& operator=(int64_t const value) const
+  {
+    void* tp = p_;
+    if (this->width_ == sizeof(int32_t)) {
+      (*static_cast<int32_t*>(tp)) = static_cast<int32_t>(value);
+    } else {
+      (*static_cast<int64_t*>(tp)) = value;
+    }
+    return *this;
+  }
+
+  /**
+   * @brief Create an output offsets iterator
+   *
+   * @param data      Pointer to an integer array in device memory.
+   * @param dtype Type of data in data
+   */
+  CUDF_HOST_DEVICE output_offsetalator(void* data, data_type dtype)
+    : base_normalator<output_offsetalator, int64_t>(
+        dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)),
+      p_{static_cast<char*>(data)}
+  {
+#ifndef __CUDA_ARCH__
+    CUDF_EXPECTS(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64,
+                 "Unexpected offsets type");
+#else
+    cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) &&
+                "Unexpected offsets type");
+#endif
+  }
+
+ protected:
+  char* p_;  /// pointer to the integer data in device memory
+};
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/include/cudf/detail/offsets_iterator_factory.cuh b/cpp/include/cudf/detail/offsets_iterator_factory.cuh
new file mode 100644
index 00000000000..5b4c6b825d2
--- /dev/null
+++ b/cpp/include/cudf/detail/offsets_iterator_factory.cuh
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/offsets_iterator.cuh>
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief Use this class to create an offsetalator instance.
+ */
+struct offsetalator_factory {
+  /**
+   * @brief Create an input offsetalator instance from an offsets column
+   */
+  static input_offsetalator make_input_iterator(column_view const& offsets)
+  {
+    return input_offsetalator(offsets.head(), offsets.type());
+  }
+
+  /**
+   * @brief Create an output offsetalator instance from an offsets column
+   */
+  static output_offsetalator make_output_iterator(mutable_column_view const& offsets)
+  {
+    return output_offsetalator(offsets.head(), offsets.type());
+  }
+};
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index b0382d15807..7b628649051 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -393,6 +393,7 @@ set_tests_properties(
 ConfigureTest(
   ITERATOR_TEST
   iterator/indexalator_test.cu
+  iterator/offsetalator_test.cu
   iterator/optional_iterator_test_chrono.cu
   iterator/optional_iterator_test_numeric.cu
   iterator/pair_iterator_test_chrono.cu
diff --git a/cpp/tests/iterator/indexalator_test.cu b/cpp/tests/iterator/indexalator_test.cu
index 3e8bcd5cb0d..0c10853ec02 100644
--- a/cpp/tests/iterator/indexalator_test.cu
+++ b/cpp/tests/iterator/indexalator_test.cu
@@ -157,40 +157,3 @@ TYPED_TEST(IndexalatorTest, output_iterator)
   expected = cudf::test::fixed_width_column_wrapper<cudf::size_type>({0, 1, 1, 2, 3, 4, 5, 5, 7});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
 }
-
-/**
- * For testing creating and using the indexalator in device code.
- */
-struct device_functor_fn {
-  cudf::column_device_view const d_col;
-  __device__ cudf::size_type operator()(cudf::size_type idx)
-  {
-    auto itr = cudf::detail::input_indexalator(d_col.head(), d_col.type());
-    return itr[idx] * 3;
-  }
-};
-
-TYPED_TEST(IndexalatorTest, device_indexalator)
-{
-  using T = TypeParam;
-
-  auto d_col1 =
-    cudf::test::fixed_width_column_wrapper<T, int32_t>({0, 6, 7, 14, 23, 33, 43, 45, 63});
-  auto d_col2 =
-    cudf::test::fixed_width_column_wrapper<cudf::size_type>({0, 0, 0, 0, 0, 0, 0, 0, 0});
-  auto input  = cudf::column_view(d_col1);
-  auto output = cudf::mutable_column_view(d_col2);
-  auto stream = cudf::get_default_stream();
-
-  auto d_input = cudf::column_device_view::create(input, stream);
-
-  thrust::transform(rmm::exec_policy(stream),
-                    thrust::counting_iterator<int>(0),
-                    thrust::counting_iterator<int>(input.size()),
-                    output.begin<cudf::size_type>(),
-                    device_functor_fn{*d_input});
-
-  auto expected =
-    cudf::test::fixed_width_column_wrapper<cudf::size_type>({0, 18, 21, 42, 69, 99, 129, 135, 189});
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
-}
diff --git a/cpp/tests/iterator/offsetalator_test.cu b/cpp/tests/iterator/offsetalator_test.cu
new file mode 100644
index 00000000000..e569e58f42a
--- /dev/null
+++ b/cpp/tests/iterator/offsetalator_test.cu
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS,  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+#include <tests/iterator/iterator_tests.cuh>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/detail/offsets_iterator_factory.cuh>
+
+#include <thrust/binary_search.h>
+#include <thrust/gather.h>
+#include <thrust/host_vector.h>
+#include <thrust/optional.h>
+#include <thrust/pair.h>
+#include <thrust/scatter.h>
+#include <thrust/sequence.h>
+#include <thrust/transform.h>
+
+using TestingTypes = cudf::test::Types<int32_t, int64_t>;
+
+template <typename T>
+struct OffsetalatorTest : public IteratorTest<T> {};
+
+TYPED_TEST_SUITE(OffsetalatorTest, TestingTypes);
+
+TYPED_TEST(OffsetalatorTest, input_iterator)
+{
+  using T = TypeParam;
+
+  auto host_values = cudf::test::make_type_param_vector<T>({0, 6, 0, -14, 13, 64, -13, -20, 45});
+
+  auto d_col = cudf::test::fixed_width_column_wrapper<T>(host_values.begin(), host_values.end());
+
+  auto expected_values = thrust::host_vector<cudf::size_type>(host_values.size());
+  std::transform(host_values.begin(), host_values.end(), expected_values.begin(), [](auto v) {
+    return static_cast<cudf::size_type>(v);
+  });
+
+  auto it_dev = cudf::detail::offsetalator_factory::make_input_iterator(d_col);
+  this->iterator_test_thrust(expected_values, it_dev, host_values.size());
+}
+
+TYPED_TEST(OffsetalatorTest, output_iterator)
+{
+  using T = TypeParam;
+
+  auto d_col1 = cudf::test::fixed_width_column_wrapper<int64_t>({0, 6, 7, 14, 23, 33, 43, 45, 63});
+  auto d_col2 = cudf::test::fixed_width_column_wrapper<T>({0, 0, 0, 0, 0, 0, 0, 0, 0});
+  auto itr    = cudf::detail::offsetalator_factory::make_output_iterator(d_col2);
+  auto input  = cudf::column_view(d_col1);
+  auto stream = cudf::get_default_stream();
+
+  auto map   = cudf::test::fixed_width_column_wrapper<int>({0, 2, 4, 6, 8, 1, 3, 5, 7});
+  auto d_map = cudf::column_view(map);
+  thrust::gather(rmm::exec_policy_nosync(stream),
+                 d_map.begin<int>(),
+                 d_map.end<int>(),
+                 input.begin<int64_t>(),
+                 itr);
+  auto expected = cudf::test::fixed_width_column_wrapper<T>({0, 7, 23, 43, 63, 6, 14, 33, 45});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+
+  thrust::scatter(rmm::exec_policy_nosync(stream),
+                  input.begin<int64_t>(),
+                  input.end<int64_t>(),
+                  d_map.begin<int>(),
+                  itr);
+  expected = cudf::test::fixed_width_column_wrapper<T>({0, 33, 6, 43, 7, 45, 14, 63, 23});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+
+  thrust::fill(rmm::exec_policy(stream), itr, itr + input.size(), 77);
+  expected = cudf::test::fixed_width_column_wrapper<T>({77, 77, 77, 77, 77, 77, 77, 77, 77});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+
+  thrust::sequence(rmm::exec_policy(stream), itr, itr + input.size());
+  expected = cudf::test::fixed_width_column_wrapper<T>({0, 1, 2, 3, 4, 5, 6, 7, 8});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+
+  auto offsets =
+    cudf::test::fixed_width_column_wrapper<int64_t>({0, 10, 20, 30, 40, 50, 60, 70, 80});
+  auto d_offsets = cudf::column_view(offsets);
+  thrust::lower_bound(rmm::exec_policy(stream),
+                      d_offsets.begin<int64_t>(),
+                      d_offsets.end<int64_t>(),
+                      input.begin<int64_t>(),
+                      input.end<int64_t>(),
+                      itr);
+  expected = cudf::test::fixed_width_column_wrapper<T>({0, 1, 1, 2, 3, 4, 5, 5, 7});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+}
+
+namespace {
+/**
+ * For testing creating and using the offsetalator in device code.
+ */
+struct device_functor_fn {
+  cudf::column_device_view const d_col;
+  __device__ int32_t operator()(int idx)
+  {
+    auto const itr = cudf::detail::input_offsetalator(d_col.head(), d_col.type());
+    return static_cast<int32_t>(itr[idx] * 3);
+  }
+};
+}  // namespace
+
+TYPED_TEST(OffsetalatorTest, device_offsetalator)
+{
+  using T = TypeParam;
+
+  auto d_col1 = cudf::test::fixed_width_column_wrapper<T>({0, 6, 7, 14, 23, 33, 43, 45, 63});
+  auto d_col2 = cudf::test::fixed_width_column_wrapper<int32_t>({0, 0, 0, 0, 0, 0, 0, 0, 0});
+  auto input  = cudf::column_view(d_col1);
+  auto output = cudf::mutable_column_view(d_col2);
+  auto stream = cudf::get_default_stream();
+
+  auto d_input = cudf::column_device_view::create(input, stream);
+
+  thrust::transform(rmm::exec_policy(stream),
+                    thrust::counting_iterator<int>(0),
+                    thrust::counting_iterator<int>(input.size()),
+                    output.begin<int32_t>(),
+                    device_functor_fn{*d_input});
+
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<int32_t>({0, 18, 21, 42, 69, 99, 129, 135, 189});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+}

From 4313cfa9b3fcff41f67b48ac8797dc015d441ecc Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Mon, 13 Nov 2023 11:40:36 -0800
Subject: [PATCH 04/17] Use new rapids-dask-dependency metapackage for managing
 dask versions (#14364)

* Update dependency lists

* Update wheel building to stop needing manual installations

* Update wheel dependency with alpha spec

* Rename the package

* Update update-version.sh

* Update conda/recipes/dask-cudf/meta.yaml

Co-authored-by: GALI PREM SAGAR <sagarprem75@gmail.com>

* Make pip/conda dependencies consistent and fix recipe

* dfg

* Apply suggestions from code review

---------

Co-authored-by: GALI PREM SAGAR <sagarprem75@gmail.com>
---
 ci/build_wheel.sh                                | 2 ++
 ci/release/update-version.sh                     | 1 +
 ci/test_wheel_dask_cudf.sh                       | 3 ---
 conda/environments/all_cuda-118_arch-x86_64.yaml | 4 +---
 conda/environments/all_cuda-120_arch-x86_64.yaml | 4 +---
 conda/recipes/dask-cudf/meta.yaml                | 8 +-------
 dependencies.yaml                                | 4 +---
 python/dask_cudf/pyproject.toml                  | 3 +--
 8 files changed, 8 insertions(+), 21 deletions(-)

diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 08716cdb3d9..ae1d9c3fb1a 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -36,6 +36,8 @@ fi
 
 if [[ ${package_name} == "dask_cudf" ]]; then
     sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
+    sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file}
+    sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file}
 else
     sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
     # ptxcompiler and cubinlinker aren't version constrained
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 7574b4174e9..843abd3c3c1 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -80,6 +80,7 @@ DEPENDENCIES=(
   kvikio
   libkvikio
   librmm
+  rapids-dask-dependency
   rmm
 )
 for DEP in "${DEPENDENCIES[@]}"; do
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index 118bea753d0..e9162b816aa 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -23,9 +23,6 @@ manylinux="manylinux_${manylinux_version}"
 RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
 python -m pip install --no-deps ./local-cudf-dep/cudf*.whl
 
-# Always install latest dask for testing
-python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-23.12
-
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install $(echo ./dist/dask_cudf*.whl)[test]
 
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index adf4fcad32d..9b85888a7b3 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -25,10 +25,7 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.3
-- dask-core>=2023.9.2
 - dask-cuda==23.12.*
-- dask>=2023.9.2
-- distributed>=2023.9.2
 - dlpack>=0.5,<0.6.0a0
 - doxygen=1.9.1
 - fastavro>=0.22.9
@@ -80,6 +77,7 @@ dependencies:
 - python-snappy>=0.6.0
 - python>=3.9,<3.11
 - pytorch<1.12.0
+- rapids-dask-dependency==23.12.*
 - rich
 - rmm==23.12.*
 - s3fs>=2022.3.0
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index a69ef587570..da2b4e109b3 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -27,10 +27,7 @@ dependencies:
 - cupy>=12.0.0
 - cxx-compiler
 - cython>=3.0.3
-- dask-core>=2023.9.2
 - dask-cuda==23.12.*
-- dask>=2023.9.2
-- distributed>=2023.9.2
 - dlpack>=0.5,<0.6.0a0
 - doxygen=1.9.1
 - fastavro>=0.22.9
@@ -78,6 +75,7 @@ dependencies:
 - python-snappy>=0.6.0
 - python>=3.9,<3.11
 - pytorch<1.12.0
+- rapids-dask-dependency==23.12.*
 - rich
 - rmm==23.12.*
 - s3fs>=2022.3.0
diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml
index 9dc9f76d9f5..16638926492 100644
--- a/conda/recipes/dask-cudf/meta.yaml
+++ b/conda/recipes/dask-cudf/meta.yaml
@@ -37,17 +37,11 @@ build:
 requirements:
   host:
     - python
-    - cudf ={{ version }}
-    - dask >=2023.9.2
-    - dask-core >=2023.9.2
-    - distributed >=2023.9.2
     - cuda-version ={{ cuda_version }}
   run:
     - python
     - cudf ={{ version }}
-    - dask >=2023.9.2
-    - dask-core >=2023.9.2
-    - distributed >=2023.9.2
+    - rapids-dask-dependency ={{ minor_version }}
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
 
 test:
diff --git a/dependencies.yaml b/dependencies.yaml
index a16b51f4483..b971a682571 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -500,12 +500,10 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - dask>=2023.9.2
-          - distributed>=2023.9.2
+          - rapids-dask-dependency==23.12.*
       - output_types: conda
         packages:
           - cupy>=12.0.0
-          - dask-core>=2023.9.2  # dask-core in conda is the actual package & dask is the meta package
       - output_types: pyproject
         packages:
           - &cudf cudf==23.12.*
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 32c7bb9fd15..0306da3de46 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -20,11 +20,10 @@ requires-python = ">=3.9"
 dependencies = [
     "cudf==23.12.*",
     "cupy-cuda11x>=12.0.0",
-    "dask>=2023.9.2",
-    "distributed>=2023.9.2",
     "fsspec>=0.6.0",
     "numpy>=1.21,<1.25",
     "pandas>=1.3,<1.6.0dev0",
+    "rapids-dask-dependency==23.12.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",

From 5d09d38bc8ea44e1bdf1fa29e11a820c7417bac5 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Tue, 14 Nov 2023 00:51:42 -0500
Subject: [PATCH 05/17] Always build nvbench statically so we don't need to
 package it (#14399)

Corrects failures seen in C++ CI where libnvbench.so can't be found

Authors:
  - Robert Maynard (https://github.com/robertmaynard)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14399
---
 cpp/cmake/thirdparty/get_nvbench.cmake             | 2 +-
 cpp/cmake/thirdparty/patches/nvbench_override.json | 5 -----
 2 files changed, 1 insertion(+), 6 deletions(-)

diff --git a/cpp/cmake/thirdparty/get_nvbench.cmake b/cpp/cmake/thirdparty/get_nvbench.cmake
index f0642145fa0..bbd22693ba4 100644
--- a/cpp/cmake/thirdparty/get_nvbench.cmake
+++ b/cpp/cmake/thirdparty/get_nvbench.cmake
@@ -21,7 +21,7 @@ function(find_and_configure_nvbench)
   set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
   rapids_cpm_package_override("${cudf_patch_dir}/nvbench_override.json")
 
-  rapids_cpm_nvbench()
+  rapids_cpm_nvbench(BUILD_STATIC)
 
 endfunction()
 
diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json
index 7be868081b6..ad9b19c29c1 100644
--- a/cpp/cmake/thirdparty/patches/nvbench_override.json
+++ b/cpp/cmake/thirdparty/patches/nvbench_override.json
@@ -7,11 +7,6 @@
           "file" : "${current_json_dir}/nvbench_global_setup.diff",
           "issue" : "Fix add support for global setup to initialize RMM in nvbench [https://github.com/NVIDIA/nvbench/pull/123]",
           "fixed_in" : ""
-        },
-        {
-          "file" : "nvbench/use_existing_fmt.diff",
-          "issue" : "Fix add support for using an existing fmt [https://github.com/NVIDIA/nvbench/pull/125]",
-          "fixed_in" : ""
         }
       ]
     }

From e982d3736f095e680298af85bde732d9b5a73122 Mon Sep 17 00:00:00 2001
From: Ashwin Srinath <3190405+shwina@users.noreply.github.com>
Date: Tue, 14 Nov 2023 09:51:02 -0500
Subject: [PATCH 06/17] cudf.pandas: cuDF subpath checking in module
 `__getattr__` (#14388)

Closes https://github.com/rapidsai/cudf/issues/14384. `x.startswith(y)` is not a good enough check for if `x` is a subdirectory of `y`. It causes `pandasai` to be reported as a sub-package of `pandas`.

Authors:
  - Ashwin Srinath (https://github.com/shwina)

Approvers:
  - https://github.com/brandon-b-miller

URL: https://github.com/rapidsai/cudf/pull/14388
---
 python/cudf/cudf/pandas/module_accelerator.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py
index eb35c4adaaf..180d75d96e8 100644
--- a/python/cudf/cudf/pandas/module_accelerator.py
+++ b/python/cudf/cudf/pandas/module_accelerator.py
@@ -10,6 +10,7 @@
 import importlib.abc
 import importlib.machinery
 import os
+import pathlib
 import sys
 import threading
 import warnings
@@ -554,9 +555,10 @@ def getattr_real_or_wrapped(
             frame = sys._getframe()
             # We cannot possibly be at the top level.
             assert frame.f_back
-            calling_module = frame.f_back.f_code.co_filename
+            calling_module = pathlib.PurePath(frame.f_back.f_code.co_filename)
             use_real = any(
-                calling_module.startswith(path) for path in loader._denylist
+                calling_module.is_relative_to(path)
+                for path in loader._denylist
             )
         try:
             if use_real:

From 7f3fba164c4dd28c701ea2941d0525fc782a639c Mon Sep 17 00:00:00 2001
From: Jeremy Dyer <jdye64@gmail.com>
Date: Tue, 14 Nov 2023 12:02:10 -0500
Subject: [PATCH 07/17] Refactor cudf_kafka to use skbuild (#14292)

Refactor the currently outdated cudf_kafka build setup to use skbuild instead.

Authors:
  - Jeremy Dyer (https://github.com/jdye64)
  - Bradley Dice (https://github.com/bdice)
  - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Bradley Dice (https://github.com/bdice)
  - AJ Schmidt (https://github.com/ajschmidt8)

URL: https://github.com/rapidsai/cudf/pull/14292
---
 build.sh                                      |  2 +-
 ci/release/update-version.sh                  |  1 +
 .../all_cuda-120_arch-x86_64.yaml             |  1 -
 conda/recipes/cudf_kafka/build.sh             | 13 ---
 .../cudf_kafka/conda_build_config.yaml        |  6 ++
 conda/recipes/cudf_kafka/meta.yaml            | 21 ++--
 cpp/libcudf_kafka/CMakeLists.txt              |  8 +-
 .../cmake/thirdparty/get_cudf.cmake           | 16 +--
 cpp/libcudf_kafka/tests/CMakeLists.txt        |  2 +-
 dependencies.yaml                             | 13 +--
 python/cudf/cudf/_lib/CMakeLists.txt          |  6 --
 python/cudf_kafka/CMakeLists.txt              | 47 +++++++++
 python/cudf_kafka/LICENSE                     |  1 +
 python/cudf_kafka/README.md                   |  1 +
 .../cudf_kafka/cudf_kafka/_lib/CMakeLists.txt | 62 ++++++++++++
 python/cudf_kafka/cudf_kafka/_lib/kafka.pxd   |  4 +-
 python/cudf_kafka/pyproject.toml              |  1 +
 python/cudf_kafka/setup.py                    | 97 ++-----------------
 18 files changed, 160 insertions(+), 142 deletions(-)
 create mode 100644 python/cudf_kafka/CMakeLists.txt
 create mode 120000 python/cudf_kafka/LICENSE
 create mode 120000 python/cudf_kafka/README.md
 create mode 100644 python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt

diff --git a/build.sh b/build.sh
index 2ad69712e5d..e5beb51dedf 100755
--- a/build.sh
+++ b/build.sh
@@ -369,7 +369,7 @@ fi
 # build cudf_kafka Python package
 if hasArg cudf_kafka; then
     cd ${REPODIR}/python/cudf_kafka
-    SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR}" \
+    SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS}" \
         SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL:-1}" \
         python -m pip install --no-build-isolation --no-deps .
 fi
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 843abd3c3c1..4f1cbc47d1d 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -43,6 +43,7 @@ sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g'
 
 # Python CMakeLists updates
 sed_runner 's/'"cudf_version .*)"'/'"cudf_version ${NEXT_FULL_TAG})"'/g' python/cudf/CMakeLists.txt
+sed_runner 's/'"cudf_kafka_version .*)"'/'"cudf_kafka_version ${NEXT_FULL_TAG})"'/g' python/cudf_kafka/CMakeLists.txt
 
 # cpp libcudf_kafka update
 sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/libcudf_kafka/CMakeLists.txt
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index da2b4e109b3..a3eeb3dd99f 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -17,7 +17,6 @@ dependencies:
 - cmake>=3.26.4
 - cramjam
 - cuda-cudart-dev
-- cuda-gdb
 - cuda-nvcc
 - cuda-nvrtc-dev
 - cuda-nvtx-dev
diff --git a/conda/recipes/cudf_kafka/build.sh b/conda/recipes/cudf_kafka/build.sh
index f4bb6e1bc91..9458349d101 100644
--- a/conda/recipes/cudf_kafka/build.sh
+++ b/conda/recipes/cudf_kafka/build.sh
@@ -1,16 +1,3 @@
 # Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
-# This assumes the script is executed from the root of the repo directory
-# Need to set CUDA_HOME inside conda environments because the hacked together
-# setup.py for cudf-kafka searches that way.
-# TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates
-# cudf_kafka to use scikit-build
-CUDA_MAJOR=${RAPIDS_CUDA_VERSION%%.*}
-if [[ ${CUDA_MAJOR} == "12" ]]; then
-    target_name="x86_64-linux"
-    if [[ ! $(arch) == "x86_64" ]]; then
-        target_name="sbsa-linux"
-    fi
-    export CUDA_HOME="${PREFIX}/targets/${target_name}/"
-fi
 ./build.sh -v cudf_kafka
diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml
index b63a136ad2d..c98c2701653 100644
--- a/conda/recipes/cudf_kafka/conda_build_config.yaml
+++ b/conda/recipes/cudf_kafka/conda_build_config.yaml
@@ -9,3 +9,9 @@ sysroot_version:
 
 cmake_version:
   - ">=3.26.4"
+
+cuda_compiler:
+  - cuda-nvcc
+
+cuda11_compiler:
+  - nvcc
diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml
index 9440f8bf124..343ec2519f1 100644
--- a/conda/recipes/cudf_kafka/meta.yaml
+++ b/conda/recipes/cudf_kafka/meta.yaml
@@ -33,28 +33,31 @@ build:
     - SCCACHE_S3_KEY_PREFIX=cudf-kafka-linux64 # [linux64]
     - SCCACHE_S3_USE_SSL
     - SCCACHE_S3_NO_CREDENTIALS
-    # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates
-    # cudf_kafka to use scikit-build
-    - RAPIDS_CUDA_VERSION
+  ignore_run_exports_from:
+    {% if cuda_major == "11" %}
+    - {{ compiler('cuda11') }}
+    {% endif %}
 
 requirements:
   build:
     - cmake {{ cmake_version }}
+    - ninja
     - {{ compiler('c') }}
     - {{ compiler('cxx') }}
-    - ninja
-    - sysroot_{{ target_platform }} {{ sysroot_version }}
-    # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates
-    # cudf_kafka to use scikit-build
-    {% if cuda_major == "12" %}
-    - cuda-gdb
+    {% if cuda_major == "11" %}
+    - {{ compiler('cuda11') }} ={{ cuda_version }}
+    {% else %}
+    - {{ compiler('cuda') }}
     {% endif %}
+    - cuda-version ={{ cuda_version }}
+    - sysroot_{{ target_platform }} {{ sysroot_version }}
   host:
     - python
     - cython >=3.0.3
     - cuda-version ={{ cuda_version }}
     - cudf ={{ version }}
     - libcudf_kafka ={{ version }}
+    - scikit-build >=0.13.1
     - setuptools
     {% if cuda_major == "12" %}
     - cuda-cudart-dev
diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt
index 1a15a3ec2cd..4128afa3935 100644
--- a/cpp/libcudf_kafka/CMakeLists.txt
+++ b/cpp/libcudf_kafka/CMakeLists.txt
@@ -21,7 +21,7 @@ include(rapids-export)
 include(rapids-find)
 
 project(
-  CUDA_KAFKA
+  CUDF_KAFKA
   VERSION 23.12.00
   LANGUAGES CXX
 )
@@ -64,7 +64,7 @@ add_library(cudf_kafka SHARED src/kafka_consumer.cpp src/kafka_callback.cpp)
 # ##################################################################################################
 # * include paths ---------------------------------------------------------------------------------
 target_include_directories(
-  cudf_kafka PUBLIC "$<BUILD_INTERFACE:${CUDA_KAFKA_SOURCE_DIR}/include>"
+  cudf_kafka PUBLIC "$<BUILD_INTERFACE:${CUDF_KAFKA_SOURCE_DIR}/include>"
                     "$<INSTALL_INTERFACE:include>"
 )
 
@@ -85,6 +85,8 @@ set_target_properties(
              CXX_STANDARD_REQUIRED ON
 )
 
+add_library(cudf_kafka::cudf_kafka ALIAS cudf_kafka)
+
 # ##################################################################################################
 # * cudf_kafka Install ----------------------------------------------------------------------------
 rapids_cmake_install_lib_dir(lib_dir)
@@ -94,7 +96,7 @@ install(
   EXPORT cudf_kafka-exports
 )
 
-install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include DESTINATION include)
+install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 
 rapids_export(
   INSTALL cudf_kafka
diff --git a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake
index aa4c5b60e7a..20aa9873f43 100644
--- a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake
+++ b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -35,21 +35,21 @@ function(find_and_configure_cudf VERSION)
   endif()
 endfunction()
 
-set(CUDA_KAFKA_MIN_VERSION_cudf
-    "${CUDA_KAFKA_VERSION_MAJOR}.${CUDA_KAFKA_VERSION_MINOR}.${CUDA_KAFKA_VERSION_PATCH}"
+set(CUDF_KAFKA_MIN_VERSION
+    "${CUDF_KAFKA_VERSION_MAJOR}.${CUDF_KAFKA_VERSION_MINOR}.${CUDF_KAFKA_VERSION_PATCH}"
 )
-find_and_configure_cudf(${CUDA_KAFKA_MIN_VERSION_cudf})
+find_and_configure_cudf(${CUDF_KAFKA_MIN_VERSION})
 
 if(cudf_REQUIRES_CUDA)
-  rapids_cuda_init_architectures(CUDA_KAFKA)
+  rapids_cuda_init_architectures(CUDF_KAFKA)
 
   # Since we are building cudf as part of ourselves we need to enable the CUDA language in the
   # top-most scope
   enable_language(CUDA)
 
-  # Since CUDA_KAFKA only enables CUDA optionally we need to manually include the file that
+  # Since CUDF_KAFKA only enables CUDA optionally we need to manually include the file that
   # rapids_cuda_init_architectures relies on `project` calling
-  if(DEFINED CMAKE_PROJECT_CUDA_KAFKA_INCLUDE)
-    include("${CMAKE_PROJECT_CUDA_KAFKA_INCLUDE}")
+  if(DEFINED CMAKE_PROJECT_CUDF_KAFKA_INCLUDE)
+    include("${CMAKE_PROJECT_CUDF_KAFKA_INCLUDE}")
   endif()
 endif()
diff --git a/cpp/libcudf_kafka/tests/CMakeLists.txt b/cpp/libcudf_kafka/tests/CMakeLists.txt
index 68a5327b455..b819cb6fc3b 100644
--- a/cpp/libcudf_kafka/tests/CMakeLists.txt
+++ b/cpp/libcudf_kafka/tests/CMakeLists.txt
@@ -26,7 +26,7 @@ function(ConfigureTest test_name)
   add_executable(${test_name} ${ARGN})
   set_target_properties(
     ${test_name}
-    PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUDA_KAFKA_BINARY_DIR}/gtests>"
+    PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUDF_KAFKA_BINARY_DIR}/gtests>"
                INSTALL_RPATH "\$ORIGIN/../../../lib"
   )
   target_link_libraries(
diff --git a/dependencies.yaml b/dependencies.yaml
index b971a682571..97149a5e2ba 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -9,8 +9,8 @@ files:
       - build_all
       - build_cpp
       - build_wheels
-      - build_python
       - build_python_common
+      - build_python_cudf
       - cudatoolkit
       - develop
       - docs
@@ -71,8 +71,8 @@ files:
       table: build-system
     includes:
       - build_all
-      - build_python
       - build_python_common
+      - build_python_cudf
       - build_wheels
   py_run_cudf:
     output: pyproject
@@ -138,8 +138,8 @@ files:
     extras:
       table: build-system
     includes:
-      - build_wheels
       - build_python_common
+      - build_wheels
   py_run_cudf_kafka:
     output: pyproject
     pyproject_dir: python/cudf_kafka
@@ -259,16 +259,16 @@ dependencies:
           - cython>=3.0.3
           # TODO: Pin to numpy<1.25 until cudf requires pandas 2
           - &numpy numpy>=1.21,<1.25
+          - scikit-build>=0.13.1
       - output_types: [conda, requirements, pyproject]
         packages:
           # Hard pin the patch version used during the build. This must be kept
           # in sync with the version pinned in get_arrow.cmake.
           - pyarrow==14.0.1.*
-  build_python:
+  build_python_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - scikit-build>=0.13.1
           - rmm==23.12.*
       - output_types: conda
         packages:
@@ -302,9 +302,6 @@ dependencies:
               - cuda-nvrtc-dev
               - cuda-nvtx-dev
               - libcurand-dev
-              # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates
-              # cudf_kafka to use scikit-build
-              - cuda-gdb
           - matrix:
               cuda: "11.8"
             packages:
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 1b543b94589..c041c7f4842 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -81,12 +81,6 @@ target_link_libraries(strings_udf cudf_strings_udf)
 # necessary. The relevant command is tar -xf /opt/_internal/static-libs-for-embedding-only.tar.xz -C
 # /opt/_internal"
 find_package(NumPy REQUIRED)
-set(targets_using_numpy interop avro csv orc json parquet)
-foreach(target IN LISTS targets_using_numpy)
-  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
-  # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24.
-  # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}")
-endforeach()
 
 set(targets_using_dlpack interop)
 foreach(target IN LISTS targets_using_dlpack)
diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt
new file mode 100644
index 00000000000..d55c3fdc076
--- /dev/null
+++ b/python/cudf_kafka/CMakeLists.txt
@@ -0,0 +1,47 @@
+# =============================================================================
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
+
+set(cudf_kafka_version 23.12.00)
+
+include(../../fetch_rapids.cmake)
+
+project(
+  cudf-kafka-python
+  VERSION ${cudf_kafka_version}
+  LANGUAGES # TODO: Building Python extension modules via the python_extension_module requires the C
+            # language to be enabled here. The test project that is built in scikit-build to verify
+            # various linking options for the python library is hardcoded to build with C, so until
+            # that is fixed we need to keep C.
+            C CXX
+)
+
+find_package(cudf_kafka ${cudf_kafka_version} REQUIRED)
+
+if(NOT cudf_kafka_FOUND)
+  message(
+    FATAL_ERROR
+      "cudf_kafka package not found. cudf_kafka C++ is required to build this Python package."
+  )
+endif()
+
+include(rapids-cython)
+rapids_cython_init()
+
+add_subdirectory(cudf_kafka/_lib)
+
+if(DEFINED cython_lib_dir)
+  rapids_cython_add_rpath_entries(TARGET cudf_kafka PATHS "${cython_lib_dir}")
+endif()
diff --git a/python/cudf_kafka/LICENSE b/python/cudf_kafka/LICENSE
new file mode 120000
index 00000000000..30cff7403da
--- /dev/null
+++ b/python/cudf_kafka/LICENSE
@@ -0,0 +1 @@
+../../LICENSE
\ No newline at end of file
diff --git a/python/cudf_kafka/README.md b/python/cudf_kafka/README.md
new file mode 120000
index 00000000000..fe840054137
--- /dev/null
+++ b/python/cudf_kafka/README.md
@@ -0,0 +1 @@
+../../README.md
\ No newline at end of file
diff --git a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt
new file mode 100644
index 00000000000..3262b7d5ebe
--- /dev/null
+++ b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt
@@ -0,0 +1,62 @@
+# =============================================================================
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+set(cython_sources kafka.pyx)
+set(linked_libraries cudf_kafka::cudf_kafka)
+
+rapids_cython_create_modules(
+  CXX ASSOCIATED_TARGETS cudf_kafka
+  SOURCE_FILES "${cython_sources}"
+  LINKED_LIBRARIES "${linked_libraries}"
+)
+
+# TODO: Finding NumPy currently requires finding Development due to a bug in CMake. This bug was
+# fixed in https://gitlab.kitware.com/cmake/cmake/-/merge_requests/7410 and will be available in
+# CMake 3.24, so we can remove the Development component once we upgrade to CMake 3.24.
+# find_package(Python REQUIRED COMPONENTS Development NumPy)
+
+# Note: The bug noted above prevents us from finding NumPy successfully using FindPython.cmake
+# inside the manylinux images used to build wheels because manylinux images do not contain
+# libpython.so and therefore Development cannot be found. Until we upgrade to CMake 3.24, we should
+# use FindNumpy.cmake instead (provided by scikit-build). When we switch to 3.24 we can try
+# switching back, but it may not work if that implicitly still requires Python libraries. In that
+# case we'll need to follow up with the CMake team to remove that dependency.  The stopgap solution
+# is to unpack the static lib tarballs in the wheel building jobs so that there are at least static
+# libs to be found, but that should be a last resort since it implies a dependency that isn't really
+# necessary. The relevant command is tar -xf /opt/_internal/static-libs-for-embedding-only.tar.xz -C
+# /opt/_internal"
+find_package(NumPy REQUIRED)
+
+find_package(Python 3.9 REQUIRED COMPONENTS Interpreter)
+
+execute_process(
+  COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())"
+  OUTPUT_VARIABLE PYARROW_INCLUDE_DIR
+  ERROR_VARIABLE PYARROW_ERROR
+  RESULT_VARIABLE PYARROW_RESULT
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+if(${PYARROW_RESULT})
+  message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}")
+endif()
+
+# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts
+# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the
+# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers.
+# These requirements will go away once all scalar-related Cython code is removed from cudf.
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
+  target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
+endforeach()
diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
index ca729c62512..068837d04ee 100644
--- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
+++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
@@ -11,12 +11,12 @@ from cudf._lib.cpp.io.datasource cimport datasource
 from cudf._lib.io.datasource cimport Datasource
 
 
-cdef extern from "kafka_callback.hpp" \
+cdef extern from "cudf_kafka/kafka_callback.hpp" \
         namespace "cudf::io::external::kafka" nogil:
     ctypedef object (*python_callable_type)()
 
 
-cdef extern from "kafka_consumer.hpp" \
+cdef extern from "cudf_kafka/kafka_consumer.hpp" \
         namespace "cudf::io::external::kafka" nogil:
 
     cpdef cppclass kafka_consumer:
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 4829f06ab09..15431161d75 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -6,6 +6,7 @@ requires = [
     "cython>=3.0.3",
     "numpy>=1.21,<1.25",
     "pyarrow==14.0.1.*",
+    "scikit-build>=0.13.1",
     "setuptools",
     "wheel",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/cudf_kafka/setup.py b/python/cudf_kafka/setup.py
index 6f3909d4528..6a99e9ed968 100644
--- a/python/cudf_kafka/setup.py
+++ b/python/cudf_kafka/setup.py
@@ -1,96 +1,13 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
-import os
-import shutil
-import sysconfig
-from distutils.sysconfig import get_python_lib
-
-import numpy as np
-import pyarrow as pa
-from Cython.Build import cythonize
-from setuptools import find_packages, setup
-from setuptools.extension import Extension
-
-cython_files = ["cudf_kafka/_lib/*.pyx"]
-
-CUDA_HOME = os.environ.get("CUDA_HOME", False)
-if not CUDA_HOME:
-    path_to_cuda_gdb = shutil.which("cuda-gdb")
-    if path_to_cuda_gdb is None:
-        raise OSError(
-            "Could not locate CUDA. "
-            "Please set the environment variable "
-            "CUDA_HOME to the path to the CUDA installation "
-            "and try again."
-        )
-    CUDA_HOME = os.path.dirname(os.path.dirname(path_to_cuda_gdb))
-
-if not os.path.isdir(CUDA_HOME):
-    raise OSError(f"Invalid CUDA_HOME: directory does not exist: {CUDA_HOME}")
-
-cuda_include_dir = os.path.join(CUDA_HOME, "include")
-
-CUDF_ROOT = os.environ.get(
-    "CUDF_ROOT",
-    os.path.abspath(
-        os.path.join(
-            os.path.dirname(os.path.abspath(__file__)), "../../cpp/build/"
-        )
-    ),
-)
-CUDF_KAFKA_ROOT = os.environ.get(
-    "CUDF_KAFKA_ROOT", "../../cpp/libcudf_kafka/build"
-)
-
-try:
-    nthreads = int(os.environ.get("PARALLEL_LEVEL", "0") or "0")
-except Exception:
-    nthreads = 0
-
-extensions = [
-    Extension(
-        "*",
-        sources=cython_files,
-        include_dirs=[
-            os.path.abspath(os.path.join(CUDF_ROOT, "../include/cudf")),
-            os.path.abspath(os.path.join(CUDF_ROOT, "../include")),
-            os.path.abspath(
-                os.path.join(CUDF_ROOT, "../libcudf_kafka/include/cudf_kafka")
-            ),
-            os.path.join(CUDF_ROOT, "include"),
-            os.path.join(CUDF_ROOT, "_deps/libcudacxx-src/include"),
-            os.path.join(
-                os.path.dirname(sysconfig.get_path("include")),
-                "rapids/libcudacxx",
-            ),
-            os.path.dirname(sysconfig.get_path("include")),
-            np.get_include(),
-            pa.get_include(),
-            cuda_include_dir,
-        ],
-        library_dirs=(
-            [
-                get_python_lib(),
-                os.path.join(os.sys.prefix, "lib"),
-                CUDF_KAFKA_ROOT,
-            ]
-        ),
-        libraries=["cudf", "cudf_kafka"],
-        language="c++",
-        extra_compile_args=["-std=c++17", "-DFMT_HEADER_ONLY=1"],
-    )
-]
+# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+from setuptools import find_packages
+from skbuild import setup
 
 packages = find_packages(include=["cudf_kafka*"])
+
 setup(
-    # Include the separately-compiled shared library
-    ext_modules=cythonize(
-        extensions,
-        nthreads=nthreads,
-        compiler_directives=dict(
-            profile=False, language_level=3, embedsignature=True
-        ),
-    ),
     packages=packages,
-    package_data={key: ["VERSION", "*.pxd"] for key in packages},
+    package_data={
+        key: ["VERSION", "*.pxd", "*.hpp", "*.cuh"] for key in packages
+    },
     zip_safe=False,
 )

From b0c1b7b82ccdf1a7e4159cb3bffa1984092440d4 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 14 Nov 2023 12:48:32 -0500
Subject: [PATCH 08/17] Add BytePairEncoder class to cuDF (#13891)

Adds a new BytePairEncoding class to cuDF
```
>>> import cudf
>>> from cudf.core.byte_pair_encoding import BytePairEncoder
>>> mps = cudf.read_text('merges.txt', delimiter='\n', strip_delimiters=True)
>>> bpe = BytePairEncoder(mps)
>>> str_series = cudf.Series(['This is a sentence', 'thisisit'])
>>> bpe(str_series)
0    This is a sent ence
1             this is it
dtype: object
```
This class wraps the existing `nvtext::byte_pair_encoding` APIs to load the merge-pairs data and encode a column of strings.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/13891
---
 .../cudf/_lib/cpp/nvtext/byte_pair_encode.pxd | 24 ++++++++
 python/cudf/cudf/_lib/nvtext/CMakeLists.txt   |  4 +-
 .../cudf/_lib/nvtext/byte_pair_encode.pyx     | 50 ++++++++++++++++
 python/cudf/cudf/core/byte_pair_encoding.py   | 59 +++++++++++++++++++
 .../cudf/cudf/tests/text/test_text_methods.py | 41 +++++++++++++
 5 files changed, 176 insertions(+), 2 deletions(-)
 create mode 100644 python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd
 create mode 100644 python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
 create mode 100644 python/cudf/cudf/core/byte_pair_encoding.py

diff --git a/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd
new file mode 100644
index 00000000000..e678e4e84db
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd
@@ -0,0 +1,24 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.scalar.scalar cimport string_scalar
+
+
+cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil:
+
+    cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs":
+        pass
+
+    cdef unique_ptr[bpe_merge_pairs] load_merge_pairs(
+        const column_view &merge_pairs
+    ) except +
+
+    cdef unique_ptr[column] byte_pair_encoding(
+        const column_view &strings,
+        const bpe_merge_pairs &merge_pairs,
+        const string_scalar &separator
+    ) except +
diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
index d4e2392ee04..d7cbdeb5bda 100644
--- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
@@ -13,8 +13,8 @@
 # =============================================================================
 
 set(cython_sources
-    edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx
-    replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
+    byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
+    ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
 )
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
new file mode 100644
index 00000000000..cfc76afa8a5
--- /dev/null
+++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
@@ -0,0 +1,50 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+
+from cudf.core.buffer import acquire_spill_lock
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.column cimport Column
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.nvtext.byte_pair_encode cimport (
+    bpe_merge_pairs as cpp_bpe_merge_pairs,
+    byte_pair_encoding as cpp_byte_pair_encoding,
+    load_merge_pairs as cpp_load_merge_pairs,
+)
+from cudf._lib.cpp.scalar.scalar cimport string_scalar
+from cudf._lib.scalar cimport DeviceScalar
+
+
+cdef class BPEMergePairs:
+    cdef unique_ptr[cpp_bpe_merge_pairs] c_obj
+
+    def __cinit__(self, Column merge_pairs):
+        cdef column_view c_pairs = merge_pairs.view()
+        with nogil:
+            self.c_obj = move(cpp_load_merge_pairs(c_pairs))
+
+
+@acquire_spill_lock()
+def byte_pair_encoding(
+    Column strings,
+    BPEMergePairs merge_pairs,
+    object separator
+):
+    cdef column_view c_strings = strings.view()
+    cdef DeviceScalar d_separator = separator.device_value
+    cdef const string_scalar* c_separator = <const string_scalar*>d_separator\
+        .get_raw_ptr()
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = move(
+            cpp_byte_pair_encoding(
+                c_strings,
+                merge_pairs.c_obj.get()[0],
+                c_separator[0]
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py
new file mode 100644
index 00000000000..4c881022ecf
--- /dev/null
+++ b/python/cudf/cudf/core/byte_pair_encoding.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from __future__ import annotations
+
+import cudf
+from cudf._lib.nvtext.byte_pair_encode import (
+    BPEMergePairs as cpp_merge_pairs,
+    byte_pair_encoding as cpp_byte_pair_encoding,
+)
+
+
+class BytePairEncoder:
+    """
+    Given a merge pairs strings series, performs byte pair encoding on
+    a strings series using the provided separator.
+
+    Parameters
+    ----------
+    merges_pairs : str
+        Strings column of merge pairs
+
+    Returns
+    -------
+    BytePairEncoder
+    """
+
+    def __init__(self, merges_pair: "cudf.Series"):
+        self.merge_pairs = cpp_merge_pairs(merges_pair._column)
+
+    def __call__(self, text, separator: str = " "):
+        """
+
+        Parameters
+        ----------
+        text : cudf string series
+            The strings to be encoded.
+
+        Returns
+        -------
+        Encoded strings
+
+        Examples
+        --------
+        >>> import cudf
+        >>> from cudf.core.byte_pair_encoding import BytePairEncoder
+        >>> mps = cudf.Series(["e n", "i t", "i s", "e s", "en t",
+        ...                    "c e", "es t", "en ce", "T h", "Th is",
+        ...                    "t est", "s ent", "t h", "th is"])
+        >>> bpe = BytePairEncoder(mps)
+        >>> str_series = cudf.Series(['This is the sentence', 'thisisit'])
+        >>> bpe(str_series)
+        0    This is a sent ence
+        1             this is it
+        dtype: object
+        """
+        sep = cudf.Scalar(separator, dtype="str")
+        result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep)
+
+        return cudf.Series(result)
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index e565df8f3da..2dccd583b23 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -7,6 +7,7 @@
 import pytest
 
 import cudf
+from cudf.core.byte_pair_encoding import BytePairEncoder
 from cudf.core.tokenize_vocabulary import TokenizeVocabulary
 from cudf.testing._utils import assert_eq
 
@@ -1024,3 +1025,43 @@ def test_jaccard_index_random_strings():
 
     actual = str1.str.jaccard_index(str2, jaccard_width)
     assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "separator, input, results",
+    [
+        (" ", "thetestsentence", "the test sent ence"),
+        ("_", "sentenceistest", "sent_ence_is_test"),
+        ("$", "istestsentencehere", "is$test$sent$ence$he$r$e"),
+    ],
+)
+def test_byte_pair_encoding(separator, input, results):
+    pairs_table = cudf.Series(
+        [
+            "t he",
+            "h e",
+            "e n",
+            "i t",
+            "i s",
+            "e s",
+            "en t",
+            "c e",
+            "es t",
+            "en ce",
+            "t h",
+            "h i",
+            "th is",
+            "t est",
+            "s i",
+            "s ent",
+        ]
+    )
+    encoder = BytePairEncoder(pairs_table)
+
+    strings = cudf.Series([input, None, "", input])
+
+    expected = cudf.Series([results, None, "", results])
+
+    actual = encoder(strings, separator)
+    assert type(expected) == type(actual)
+    assert_eq(expected, actual)

From b446a6f187241e765c925da1053ece2679313a06 Mon Sep 17 00:00:00 2001
From: David Wendt <45795991+davidwendt@users.noreply.github.com>
Date: Tue, 14 Nov 2023 12:49:19 -0500
Subject: [PATCH 09/17] Fix token-count logic in
 nvtext::tokenize_with_vocabulary (#14393)

Fixes a bug introduced in #14336 when trying to simplify the token-counting logic as per this discussion https://github.com/rapidsai/cudf/pull/14336#discussion_r1378173552
The simplification caused an error which was found when running the nvtext benchmarks.
The appropriate gtest has been updated to cover this case now.

Authors:
  - David Wendt (https://github.com/davidwendt)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Karthikeyan (https://github.com/karthikeyann)

URL: https://github.com/rapidsai/cudf/pull/14393
---
 cpp/benchmarks/text/vocab.cpp       |  2 +-
 cpp/src/text/vocabulary_tokenize.cu |  8 ++++++--
 cpp/tests/text/tokenize_tests.cpp   | 12 ++++++------
 3 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp
index 6922b7214ff..80942e2697d 100644
--- a/cpp/benchmarks/text/vocab.cpp
+++ b/cpp/benchmarks/text/vocab.cpp
@@ -53,7 +53,7 @@ static void bench_vocab_tokenize(nvbench::state& state)
 
   auto const vocab_col = [] {
     data_profile const profile = data_profile_builder().no_validity().distribution(
-      cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
+      cudf::type_id::STRING, distribution_id::NORMAL, 0, 15);
     auto const col = create_random_column(cudf::type_id::STRING, row_count{100}, profile);
     return cudf::strings::filter_characters_of_type(
       cudf::strings_column_view(col->view()),
diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index 41f8c0a8731..511f1995374 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -276,8 +276,12 @@ __global__ void token_counts_fn(cudf::column_device_view const d_strings,
   __syncwarp();
 
   for (auto itr = d_output + lane_idx + 1; itr < d_output_end; itr += cudf::detail::warp_size) {
-    // add one if at the edge of a token or at the string's end
-    count += ((*itr && !(*(itr - 1))) || (itr + 1 == d_output_end));
+    // add one if at the edge of a token or if at the string's end
+    if (*itr) {
+      count += !(*(itr - 1));
+    } else {
+      count += (itr + 1 == d_output_end);
+    }
   }
   __syncwarp();
 
diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp
index 8118183a458..ea36e13de6f 100644
--- a/cpp/tests/text/tokenize_tests.cpp
+++ b/cpp/tests/text/tokenize_tests.cpp
@@ -246,14 +246,14 @@ TEST_F(TextTokenizeTest, Vocabulary)
 
 TEST_F(TextTokenizeTest, VocabularyLongStrings)
 {
-  cudf::test::strings_column_wrapper vocabulary(  // leaving out 'cat' on purpose
+  cudf::test::strings_column_wrapper vocabulary(
     {"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"});
   auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary));
 
   std::vector<std::string> h_strings(
     4,
     "the fox jumped chased the dog cheese mouse at the over there dog mouse cat plus the horse "
-    "jumped over the mouse house with the dog");
+    "jumped  over  the mousé  house with the dog  ");
   cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end());
   auto input_view = cudf::strings_column_view(input);
   auto delimiter  = cudf::string_scalar(" ");
@@ -262,10 +262,10 @@ TEST_F(TextTokenizeTest, VocabularyLongStrings)
 
   using LCW = cudf::test::lists_column_wrapper<cudf::size_type>;
   // clang-format off
-  LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
-                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
-                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
-                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}});
+  LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
+                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
+                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
+                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}});
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 

From 8106a0c3d2050786f42152a280bd9315b897379e Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Tue, 14 Nov 2023 16:03:54 -0600
Subject: [PATCH 10/17] Cleanup remaining usages of dask dependencies (#14407)

This PR switches remaining usages of `dask` dependencies to use `rapids-dask-dependency`

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Bradley Dice (https://github.com/bdice)
  - Jake Awe (https://github.com/AyodeAwe)
  - Vyas Ramasubramani (https://github.com/vyasr)

URL: https://github.com/rapidsai/cudf/pull/14407
---
 conda/recipes/custreamz/meta.yaml   |  4 +---
 conda/recipes/dask-cudf/run_test.sh | 36 -----------------------------
 2 files changed, 1 insertion(+), 39 deletions(-)
 delete mode 100644 conda/recipes/dask-cudf/run_test.sh

diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml
index fb6efabffd4..b8c5918ea60 100644
--- a/conda/recipes/custreamz/meta.yaml
+++ b/conda/recipes/custreamz/meta.yaml
@@ -45,9 +45,7 @@ requirements:
     - streamz
     - cudf ={{ version }}
     - cudf_kafka ={{ version }}
-    - dask >=2023.9.2
-    - dask-core >=2023.9.2
-    - distributed >=2023.9.2
+    - rapids-dask-dependency ={{ version }}
     - python-confluent-kafka >=1.9.0,<1.10.0a0
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
 
diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh
deleted file mode 100644
index e7238d00f2b..00000000000
--- a/conda/recipes/dask-cudf/run_test.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
-
-set -e
-
-# Logger function for build status output
-function logger() {
-  echo -e "\n>>>> $@\n"
-}
-
-# Importing cudf on arm64 CPU only nodes is currently not working due to a
-# difference in reported gpu devices between arm64 and amd64
-ARCH=$(arch)
-
-if [ "${ARCH}" = "aarch64" ]; then
-  logger "Skipping tests on arm64"
-  exit 0
-fi
-
-# Dask & Distributed option to install main(nightly) or `conda-forge` packages.
-export INSTALL_DASK_MAIN=1
-
-# Dask version to install when `INSTALL_DASK_MAIN=0`
-export DASK_STABLE_VERSION="2023.9.2"
-
-# Install the conda-forge or nightly version of dask and distributed
-if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then
-    rapids-logger "rapids-mamba-retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'"
-    rapids-mamba-retry install -c dask/label/dev "dask/label/dev::dask" "dask/label/dev::distributed"
-else
-    rapids-logger "rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall"
-    rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall
-fi
-
-logger "python -c 'import dask_cudf'"
-python -c "import dask_cudf"

From 27b052d01ebdfd3690b90588971817423614acc0 Mon Sep 17 00:00:00 2001
From: shrshi <shruti.shivakumar@gmail.com>
Date: Tue, 14 Nov 2023 14:39:07 -0800
Subject: [PATCH 11/17] Added streams to CSV reader and writer api (#14340)

This PR contributes to https://github.com/rapidsai/cudf/issues/13744.
-Added stream parameters to public APIs
`cudf::io::read_csv`
`cudf::io::write_csv`
-Added stream gtests

Authors:
  - https://github.com/shrshi
  - Karthikeyan (https://github.com/karthikeyann)

Approvers:
  - Karthikeyan (https://github.com/karthikeyann)
  - Vukasin Milovanovic (https://github.com/vuule)
  - Yunsong Wang (https://github.com/PointKernel)

URL: https://github.com/rapidsai/cudf/pull/14340
---
 cpp/include/cudf/io/csv.hpp              |   4 +
 cpp/include/cudf/io/detail/csv.hpp       |   1 -
 cpp/include/cudf_test/column_wrapper.hpp |  16 ++--
 cpp/src/io/csv/writer_impl.cu            |  38 +++++----
 cpp/src/io/functions.cpp                 |  12 ++-
 cpp/tests/CMakeLists.txt                 |   1 +
 cpp/tests/streams/io/csv_test.cpp        | 102 +++++++++++++++++++++++
 7 files changed, 150 insertions(+), 24 deletions(-)
 create mode 100644 cpp/tests/streams/io/csv_test.cpp

diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index ac885c54356..435583e805d 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -1307,6 +1307,7 @@ class csv_reader_options_builder {
  * @endcode
  *
  * @param options Settings for controlling reading behavior
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate device memory of the table in the returned
  * table_with_metadata
  *
@@ -1314,6 +1315,7 @@ class csv_reader_options_builder {
  */
 table_with_metadata read_csv(
   csv_reader_options options,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
@@ -1715,9 +1717,11 @@ class csv_writer_options_builder {
  * @endcode
  *
  * @param options Settings for controlling writing behavior
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to use for device memory allocation
  */
 void write_csv(csv_writer_options const& options,
+               rmm::cuda_stream_view stream        = cudf::get_default_stream(),
                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp
index 9fdc7a47fb9..40ddcf385b0 100644
--- a/cpp/include/cudf/io/detail/csv.hpp
+++ b/cpp/include/cudf/io/detail/csv.hpp
@@ -17,7 +17,6 @@
 #pragma once
 
 #include <cudf/io/csv.hpp>
-#include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index e94dfea9dcf..b9f2e0d9868 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -803,7 +803,8 @@ class strings_column_wrapper : public detail::column_wrapper {
       offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
     auto d_bitmask = cudf::detail::make_device_uvector_sync(
       null_mask, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
-    wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask, null_count);
+    wrapped = cudf::make_strings_column(
+      d_chars, d_offsets, d_bitmask, null_count, cudf::test::get_default_stream());
   }
 
   /**
@@ -1846,7 +1847,8 @@ class structs_column_wrapper : public detail::column_wrapper {
                    child_column_wrappers.end(),
                    std::back_inserter(child_columns),
                    [&](auto const& column_wrapper) {
-                     return std::make_unique<cudf::column>(column_wrapper.get());
+                     return std::make_unique<cudf::column>(column_wrapper.get(),
+                                                           cudf::test::get_default_stream());
                    });
     init(std::move(child_columns), validity);
   }
@@ -1882,7 +1884,8 @@ class structs_column_wrapper : public detail::column_wrapper {
                    child_column_wrappers.end(),
                    std::back_inserter(child_columns),
                    [&](auto const& column_wrapper) {
-                     return std::make_unique<cudf::column>(column_wrapper.get());
+                     return std::make_unique<cudf::column>(column_wrapper.get(),
+                                                           cudf::test::get_default_stream());
                    });
     init(std::move(child_columns), validity_iter);
   }
@@ -1906,8 +1909,11 @@ class structs_column_wrapper : public detail::column_wrapper {
       return cudf::test::detail::make_null_mask(validity.begin(), validity.end());
     }();
 
-    wrapped = cudf::make_structs_column(
-      num_rows, std::move(child_columns), null_count, std::move(null_mask));
+    wrapped = cudf::make_structs_column(num_rows,
+                                        std::move(child_columns),
+                                        null_count,
+                                        std::move(null_mask),
+                                        cudf::test::get_default_stream());
   }
 
   template <typename V>
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 8c586306ad5..6e9c634804c 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -146,6 +146,12 @@ struct column_to_strings_fn {
   {
   }
 
+  ~column_to_strings_fn()                                      = default;
+  column_to_strings_fn(column_to_strings_fn const&)            = delete;
+  column_to_strings_fn& operator=(column_to_strings_fn const&) = delete;
+  column_to_strings_fn(column_to_strings_fn&&)                 = delete;
+  column_to_strings_fn& operator=(column_to_strings_fn&&)      = delete;
+
   // Note: `null` replacement with `na_rep` deferred to `concatenate()`
   // instead of column-wise; might be faster
   //
@@ -160,8 +166,9 @@ struct column_to_strings_fn {
   std::enable_if_t<std::is_same_v<column_type, bool>, std::unique_ptr<column>> operator()(
     column_view const& column) const
   {
-    return cudf::strings::detail::from_booleans(
-      column, options_.get_true_value(), options_.get_false_value(), stream_, mr_);
+    string_scalar true_string{options_.get_true_value(), true, stream_};
+    string_scalar false_string{options_.get_false_value(), true, stream_};
+    return cudf::strings::detail::from_booleans(column, true_string, false_string, stream_, mr_);
   }
 
   // strings:
@@ -367,10 +374,10 @@ void write_chunked(data_sink* out_sink,
 
   CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column.");
 
-  cudf::string_scalar newline{options.get_line_terminator()};
+  cudf::string_scalar newline{options.get_line_terminator(), true, stream};
   auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view,
                                                             newline,
-                                                            string_scalar("", false),
+                                                            string_scalar{"", false, stream},
                                                             stream,
                                                             rmm::mr::get_current_device_resource());
   strings_column_view strings_column{p_str_col_w_nl->view()};
@@ -455,12 +462,14 @@ void write_csv(data_sink* out_sink,
 
       // populate vector of string-converted columns:
       //
-      std::transform(sub_view.begin(),
-                     sub_view.end(),
-                     std::back_inserter(str_column_vec),
-                     [converter](auto const& current_col) {
-                       return cudf::type_dispatcher(current_col.type(), converter, current_col);
-                     });
+      std::transform(
+        sub_view.begin(),
+        sub_view.end(),
+        std::back_inserter(str_column_vec),
+        [&converter = std::as_const(converter)](auto const& current_col) {
+          return cudf::type_dispatcher<cudf::id_to_type_impl, column_to_strings_fn const&>(
+            current_col.type(), converter, current_col);
+        });
 
       // create string table view from str_column_vec:
       //
@@ -470,18 +479,19 @@ void write_csv(data_sink* out_sink,
       // concatenate columns in each row into one big string column
       // (using null representation and delimiter):
       //
-      std::string delimiter_str{options.get_inter_column_delimiter()};
       auto str_concat_col = [&] {
+        cudf::string_scalar delimiter_str{
+          std::string{options.get_inter_column_delimiter()}, true, stream};
+        cudf::string_scalar options_narep{options.get_na_rep(), true, stream};
         if (str_table_view.num_columns() > 1)
           return cudf::strings::detail::concatenate(str_table_view,
                                                     delimiter_str,
-                                                    options.get_na_rep(),
+                                                    options_narep,
                                                     strings::separator_on_nulls::YES,
                                                     stream,
                                                     rmm::mr::get_current_device_resource());
-        cudf::string_scalar narep{options.get_na_rep()};
         return cudf::strings::detail::replace_nulls(
-          str_table_view.column(0), narep, stream, rmm::mr::get_current_device_resource());
+          str_table_view.column(0), options_narep, stream, rmm::mr::get_current_device_resource());
       }();
 
       write_chunked(out_sink, str_concat_col->view(), options, stream, mr);
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 00d56008611..964e40e36cd 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -230,7 +230,9 @@ void write_json(json_writer_options const& options,
     mr);
 }
 
-table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr)
+table_with_metadata read_csv(csv_reader_options options,
+                             rmm::cuda_stream_view stream,
+                             rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -245,12 +247,14 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_
   return cudf::io::detail::csv::read_csv(  //
     std::move(datasources[0]),
     options,
-    cudf::get_default_stream(),
+    stream,
     mr);
 }
 
 // Freeform API wraps the detail writer class API
-void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resource* mr)
+void write_csv(csv_writer_options const& options,
+               rmm::cuda_stream_view stream,
+               rmm::mr::device_memory_resource* mr)
 {
   using namespace cudf::io::detail;
 
@@ -262,7 +266,7 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc
     options.get_table(),
     options.get_names(),
     options,
-    cudf::get_default_stream(),
+    stream,
     mr);
 }
 
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index 7b628649051..1be8566fb0f 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -635,6 +635,7 @@ ConfigureTest(
 ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_CSVIO_TEST streams/io/csv_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/streams/io/csv_test.cpp b/cpp/tests/streams/io/csv_test.cpp
new file mode 100644
index 00000000000..88514fa412c
--- /dev/null
+++ b/cpp/tests/streams/io/csv_test.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/io/csv.hpp>
+#include <cudf/io/detail/csv.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+
+#include <string>
+#include <vector>
+
+auto const temp_env = static_cast<cudf::test::TempDirTestEnvironment*>(
+  ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));
+
+class CSVTest : public cudf::test::BaseFixture {};
+
+TEST_F(CSVTest, CSVWriter)
+{
+  constexpr auto num_rows = 10;
+
+  std::vector<size_t> zeros(num_rows, 0);
+  std::vector<size_t> ones(num_rows, 1);
+  auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return numeric::decimal128{ones[i], numeric::scale_type{12}};
+  });
+  auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return numeric::decimal128{ones[i], numeric::scale_type{-12}};
+  });
+
+  cudf::test::fixed_width_column_wrapper<bool> col0(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<int8_t> col1(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<int16_t> col2(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<int32_t> col3(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<float> col4(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<double> col5(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col6(col6_data, col6_data + num_rows);
+  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col7(col7_data, col7_data + num_rows);
+
+  std::vector<std::string> col8_data(num_rows, "rapids");
+  cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end());
+
+  cudf::table_view tab({col0, col1, col2, col3, col4, col5, col6, col7, col8});
+
+  auto const filepath = temp_env->get_temp_dir() + "multicolumn.csv";
+  auto w_options      = cudf::io::csv_writer_options::builder(cudf::io::sink_info{filepath}, tab)
+                     .include_header(false)
+                     .inter_column_delimiter(',');
+  cudf::io::write_csv(w_options.build(), cudf::test::get_default_stream());
+}
+
+TEST_F(CSVTest, CSVReader)
+{
+  constexpr auto num_rows = 10;
+
+  std::vector<size_t> zeros(num_rows, 0);
+  std::vector<size_t> ones(num_rows, 1);
+  auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return numeric::decimal128{ones[i], numeric::scale_type{12}};
+  });
+  auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return numeric::decimal128{ones[i], numeric::scale_type{-12}};
+  });
+
+  cudf::test::fixed_width_column_wrapper<bool> col0(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<int8_t> col1(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<int16_t> col2(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<int32_t> col3(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<float> col4(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<double> col5(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col6(col6_data, col6_data + num_rows);
+  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col7(col7_data, col7_data + num_rows);
+
+  std::vector<std::string> col8_data(num_rows, "rapids");
+  cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end());
+
+  cudf::table_view tab({col0, col1, col2, col3, col4, col5, col6, col7, col8});
+
+  auto const filepath = temp_env->get_temp_dir() + "multicolumn.csv";
+  auto w_options      = cudf::io::csv_writer_options::builder(cudf::io::sink_info{filepath}, tab)
+                     .include_header(false)
+                     .inter_column_delimiter(',');
+  cudf::io::write_csv(w_options.build(), cudf::test::get_default_stream());
+}

From 330d389b26a05676d9f079503a3d96b571762337 Mon Sep 17 00:00:00 2001
From: Robert Maynard <rmaynard@nvidia.com>
Date: Tue, 14 Nov 2023 17:56:55 -0500
Subject: [PATCH 12/17] Ensure nvbench initializes nvml context when built
 statically (#14411)

Port https://github.com/NVIDIA/nvbench/pull/148 to cudf so that nvbench benchmarks work now that we always use a static version of nvbench.

Authors:
  - Robert Maynard (https://github.com/robertmaynard)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14411
---
 cpp/cmake/thirdparty/patches/nvbench_override.json | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json
index ad9b19c29c1..f85bdb9486c 100644
--- a/cpp/cmake/thirdparty/patches/nvbench_override.json
+++ b/cpp/cmake/thirdparty/patches/nvbench_override.json
@@ -7,6 +7,11 @@
           "file" : "${current_json_dir}/nvbench_global_setup.diff",
           "issue" : "Fix add support for global setup to initialize RMM in nvbench [https://github.com/NVIDIA/nvbench/pull/123]",
           "fixed_in" : ""
+        },
+        {
+          "file" : "nvbench/nvml_with_static_builds.diff",
+          "issue" : "Add support for nvml with static nvbench [https://github.com/NVIDIA/nvbench/pull/148]",
+          "fixed_in" : ""
         }
       ]
     }

From 8a0a08f34ff804a7329ea640aa1e0a9b188d2162 Mon Sep 17 00:00:00 2001
From: Matthew Roeschke <10647082+mroeschke@users.noreply.github.com>
Date: Tue, 14 Nov 2023 17:55:16 -1000
Subject: [PATCH 13/17] Fix as_column(pd.Timestamp/Timedelta, length=) not
 respecting length (#14390)

Noticed this while trying to clean up `as_column`

Authors:
  - Matthew Roeschke (https://github.com/mroeschke)

Approvers:
  - Bradley Dice (https://github.com/bdice)

URL: https://github.com/rapidsai/cudf/pull/14390
---
 python/cudf/cudf/core/column/column.py |  5 ++++-
 python/cudf/cudf/tests/test_column.py  | 13 ++++++++-----
 2 files changed, 12 insertions(+), 6 deletions(-)

diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index a5e99abd79e..b4f65693d85 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2102,7 +2102,10 @@ def as_column(
     elif isinstance(arbitrary, (pd.Timestamp, pd.Timedelta)):
         # This will always treat NaTs as nulls since it's not technically a
         # discrete value like NaN
-        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))
+        length = length or 1
+        data = as_column(
+            pa.array(pd.Series([arbitrary] * length), from_pandas=True)
+        )
         if dtype is not None:
             data = data.astype(dtype)
 
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index db0446d506c..0546638f388 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -193,12 +193,15 @@ def test_column_mixed_dtype(data, error):
 
 
 @pytest.mark.parametrize("nan_as_null", [True, False])
-def test_as_column_scalar_with_nan(nan_as_null):
-    size = 10
-    scalar = np.nan
-
+@pytest.mark.parametrize(
+    "scalar",
+    [np.nan, pd.Timedelta(days=1), pd.Timestamp(2020, 1, 1)],
+    ids=repr,
+)
+@pytest.mark.parametrize("size", [1, 10])
+def test_as_column_scalar_with_nan(nan_as_null, scalar, size):
     expected = (
-        cudf.Series([np.nan] * size, nan_as_null=nan_as_null)
+        cudf.Series([scalar] * size, nan_as_null=nan_as_null)
         .dropna()
         .to_numpy()
     )

From ab2248ea8e693143823d02bb8b806c65bfc3bf32 Mon Sep 17 00:00:00 2001
From: Vukasin Milovanovic <vmilovanovic@nvidia.com>
Date: Tue, 14 Nov 2023 23:30:51 -0800
Subject: [PATCH 14/17] Fix and disable encoding for nanosecond statistics in
 ORC writer (#14367)

Issue https://github.com/rapidsai/cudf/issues/14325

Use uint when reading/writing nano stats because nanoseconds have int32 encoding (different from both unit32 and sint32, _obviously_), which does not use zigzag.
sint32 uses zigzag, and unit32 does not allow negative numbers, so we can use uint since we'll never have negative nanoseconds.

Also disabled the nanoseconds because it should only be written after ORC-135; we don't write the version so readers get confused if nanoseconds are there. Planning to re-enable once we start writing the version.

Authors:
  - Vukasin Milovanovic (https://github.com/vuule)

Approvers:
  - Vyas Ramasubramani (https://github.com/vyasr)
  - Nghia Truong (https://github.com/ttnghia)

URL: https://github.com/rapidsai/cudf/pull/14367
---
 cpp/include/cudf/io/orc_metadata.hpp |  8 +++----
 cpp/src/io/orc/orc.cpp               | 13 +++++++++++
 cpp/src/io/orc/orc.hpp               |  6 +++++
 cpp/src/io/orc/stats_enc.cu          | 35 +++++++++++++++++++++-------
 cpp/tests/io/orc_test.cpp            | 16 +++++++++----
 5 files changed, 61 insertions(+), 17 deletions(-)

diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp
index 82d59803c25..9531a012e49 100644
--- a/cpp/include/cudf/io/orc_metadata.hpp
+++ b/cpp/include/cudf/io/orc_metadata.hpp
@@ -141,10 +141,10 @@ using binary_statistics = sum_statistics<int64_t>;
  * the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values adjusted to UTC.
  */
 struct timestamp_statistics : minmax_statistics<int64_t> {
-  std::optional<int64_t> minimum_utc;    ///< minimum in milliseconds
-  std::optional<int64_t> maximum_utc;    ///< maximum in milliseconds
-  std::optional<int32_t> minimum_nanos;  ///< nanoseconds part of the minimum
-  std::optional<int32_t> maximum_nanos;  ///< nanoseconds part of the maximum
+  std::optional<int64_t> minimum_utc;     ///< minimum in milliseconds
+  std::optional<int64_t> maximum_utc;     ///< maximum in milliseconds
+  std::optional<uint32_t> minimum_nanos;  ///< nanoseconds part of the minimum
+  std::optional<uint32_t> maximum_nanos;  ///< nanoseconds part of the maximum
 };
 
 namespace orc {
diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp
index bc399b75ef9..ee5fa4e8b5a 100644
--- a/cpp/src/io/orc/orc.cpp
+++ b/cpp/src/io/orc/orc.cpp
@@ -182,6 +182,19 @@ void ProtobufReader::read(timestamp_statistics& s, size_t maxlen)
                        field_reader(5, s.minimum_nanos),
                        field_reader(6, s.maximum_nanos));
   function_builder(s, maxlen, op);
+
+  // Adjust nanoseconds because they are encoded as (value + 1)
+  // Range [1, 1000'000] is translated here to [0, 999'999]
+  if (s.minimum_nanos.has_value()) {
+    auto& min_nanos = s.minimum_nanos.value();
+    CUDF_EXPECTS(min_nanos >= 1 and min_nanos <= 1000'000, "Invalid minimum nanoseconds");
+    --min_nanos;
+  }
+  if (s.maximum_nanos.has_value()) {
+    auto& max_nanos = s.maximum_nanos.value();
+    CUDF_EXPECTS(max_nanos >= 1 and max_nanos <= 1000'000, "Invalid maximum nanoseconds");
+    --max_nanos;
+  }
 }
 
 void ProtobufReader::read(column_statistics& s, size_t maxlen)
diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp
index 6f65e384d2d..783ed4206b6 100644
--- a/cpp/src/io/orc/orc.hpp
+++ b/cpp/src/io/orc/orc.hpp
@@ -41,6 +41,12 @@ static constexpr uint32_t block_header_size = 3;
 // Seconds from January 1st, 1970 to January 1st, 2015
 static constexpr int64_t orc_utc_epoch = 1420070400;
 
+// Used for the nanosecond remainder in timestamp statistics when the actual nanoseconds of min/max
+// are not included. As the timestamp statistics are stored as milliseconds + nanosecond remainder,
+// the maximum nanosecond remainder is 999,999 (nanoseconds in a millisecond - 1).
+static constexpr int32_t DEFAULT_MIN_NANOS = 0;
+static constexpr int32_t DEFAULT_MAX_NANOS = 999'999;
+
 struct PostScript {
   uint64_t footerLength       = 0;     // the length of the footer section in bytes
   CompressionKind compression = NONE;  // the kind of generic compression used
diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu
index 479a2dfada3..429fd5b929d 100644
--- a/cpp/src/io/orc/stats_enc.cu
+++ b/cpp/src/io/orc/stats_enc.cu
@@ -27,6 +27,10 @@ namespace cudf::io::orc::gpu {
 
 using strings::detail::fixed_point_string_size;
 
+// Nanosecond statistics should not be enabled until the spec version is set correctly in the output
+// files. See https://github.com/rapidsai/cudf/issues/14325 for more details
+constexpr bool enable_nanosecond_statistics = false;
+
 constexpr unsigned int init_threads_per_group = 32;
 constexpr unsigned int init_groups_per_block  = 4;
 constexpr unsigned int init_threads_per_block = init_threads_per_group * init_groups_per_block;
@@ -96,8 +100,10 @@ __global__ void __launch_bounds__(block_size, 1)
           stats_len = pb_fldlen_common + pb_fld_hdrlen + 2 * (pb_fld_hdrlen + pb_fldlen_int64);
           break;
         case dtype_timestamp64:
-          stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64) +
-                      2 * (pb_fld_hdrlen + pb_fldlen_int32);
+          stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64);
+          if constexpr (enable_nanosecond_statistics) {
+            stats_len += 2 * (pb_fld_hdrlen + pb_fldlen_int32);
+          }
           break;
         case dtype_float32:
         case dtype_float64:
@@ -405,7 +411,8 @@ __global__ void __launch_bounds__(encode_threads_per_block)
         //  optional sint64 minimumUtc = 3; // min,max values saved as milliseconds since UNIX epoch
         //  optional sint64 maximumUtc = 4;
         //  optional int32 minimumNanos = 5; // lower 6 TS digits for min/max to achieve nanosecond
-        //  precision optional int32 maximumNanos = 6;
+        //  precision
+        // optional int32 maximumNanos = 6;
         // }
         if (s->chunk.has_minmax) {
           cur[0] = 9 * 8 + ProtofType::FIXEDLEN;
@@ -416,12 +423,22 @@ __global__ void __launch_bounds__(encode_threads_per_block)
             split_nanosecond_timestamp(s->chunk.max_value.i_val);
 
           // minimum/maximum are the same as minimumUtc/maximumUtc as we always write files in UTC
-          cur          = pb_put_int(cur, 1, min_ms);            // minimum
-          cur          = pb_put_int(cur, 2, max_ms);            // maximum
-          cur          = pb_put_int(cur, 3, min_ms);            // minimumUtc
-          cur          = pb_put_int(cur, 4, max_ms);            // maximumUtc
-          cur          = pb_put_int(cur, 5, min_ns_remainder);  // minimumNanos
-          cur          = pb_put_int(cur, 6, max_ns_remainder);  // maximumNanos
+          cur = pb_put_int(cur, 1, min_ms);  // minimum
+          cur = pb_put_int(cur, 2, max_ms);  // maximum
+          cur = pb_put_int(cur, 3, min_ms);  // minimumUtc
+          cur = pb_put_int(cur, 4, max_ms);  // maximumUtc
+
+          if constexpr (enable_nanosecond_statistics) {
+            if (min_ns_remainder != DEFAULT_MIN_NANOS) {
+              // using uint because positive values are not zigzag encoded
+              cur = pb_put_uint(cur, 5, min_ns_remainder + 1);  // minimumNanos
+            }
+            if (max_ns_remainder != DEFAULT_MAX_NANOS) {
+              // using uint because positive values are not zigzag encoded
+              cur = pb_put_uint(cur, 6, max_ns_remainder + 1);  // maximumNanos
+            }
+          }
+
           fld_start[1] = cur - (fld_start + 2);
         }
         break;
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index 234716749ff..dca3886db14 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -1054,8 +1054,12 @@ TEST_F(OrcStatisticsTest, Basic)
     EXPECT_EQ(*ts4.maximum, 3);
     EXPECT_EQ(*ts4.minimum_utc, -4);
     EXPECT_EQ(*ts4.maximum_utc, 3);
-    EXPECT_EQ(*ts4.minimum_nanos, 999994);
-    EXPECT_EQ(*ts4.maximum_nanos, 6);
+    // nanosecond precision can't be included until we write a writer version that includes ORC-135
+    // see https://github.com/rapidsai/cudf/issues/14325
+    // EXPECT_EQ(*ts4.minimum_nanos, 999994);
+    EXPECT_FALSE(ts4.minimum_nanos.has_value());
+    // EXPECT_EQ(*ts4.maximum_nanos, 6);
+    EXPECT_FALSE(ts4.maximum_nanos.has_value());
 
     auto& s5 = stats[5];
     EXPECT_EQ(*s5.number_of_values, 4ul);
@@ -1065,8 +1069,12 @@ TEST_F(OrcStatisticsTest, Basic)
     EXPECT_EQ(*ts5.maximum, 3000);
     EXPECT_EQ(*ts5.minimum_utc, -3001);
     EXPECT_EQ(*ts5.maximum_utc, 3000);
-    EXPECT_EQ(*ts5.minimum_nanos, 994000);
-    EXPECT_EQ(*ts5.maximum_nanos, 6000);
+    // nanosecond precision can't be included until we write a writer version that includes ORC-135
+    // see https://github.com/rapidsai/cudf/issues/14325
+    // EXPECT_EQ(*ts5.minimum_nanos, 994000);
+    EXPECT_FALSE(ts5.minimum_nanos.has_value());
+    // EXPECT_EQ(*ts5.maximum_nanos, 6000);
+    EXPECT_FALSE(ts5.maximum_nanos.has_value());
 
     auto& s6 = stats[6];
     EXPECT_EQ(*s6.number_of_values, 4ul);

From 8deb3dd7573000e7d87f18a9e2bbe39cf2932e10 Mon Sep 17 00:00:00 2001
From: GALI PREM SAGAR <sagarprem75@gmail.com>
Date: Wed, 15 Nov 2023 07:56:37 -0600
Subject: [PATCH 15/17] Raise error in `reindex` when `index` is not unique
 (#14400)

Fixes: #14398
This PR raises an error in `reindex` API when reindexing is performed on a non-unique index column.

Authors:
  - GALI PREM SAGAR (https://github.com/galipremsagar)

Approvers:
  - Matthew Roeschke (https://github.com/mroeschke)
  - Lawrence Mitchell (https://github.com/wence-)

URL: https://github.com/rapidsai/cudf/pull/14400
---
 python/cudf/cudf/core/indexed_frame.py   |  4 ++++
 python/cudf/cudf/tests/test_dataframe.py | 12 ++++++++++++
 python/cudf/cudf/tests/test_series.py    | 12 ++++++++++++
 python/dask_cudf/dask_cudf/backends.py   | 13 ++++---------
 4 files changed, 32 insertions(+), 9 deletions(-)

diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 376bef6d0b2..4211a8c24bf 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -2607,6 +2607,10 @@ def _reindex(
 
         df = self
         if index is not None:
+            if not df._index.is_unique:
+                raise ValueError(
+                    "cannot reindex on an axis with duplicate labels"
+                )
             index = cudf.core.index.as_index(
                 index, name=getattr(index, "name", self._index.name)
             )
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index d44cf594e8b..5677f97408a 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10723,3 +10723,15 @@ def test_dataframe_series_dot():
     expected = gser @ [12, 13]
 
     assert_eq(expected, actual)
+
+
+def test_dataframe_duplicate_index_reindex():
+    gdf = cudf.DataFrame({"a": [0, 1, 2, 3]}, index=[0, 0, 1, 1])
+    pdf = gdf.to_pandas()
+
+    assert_exceptions_equal(
+        gdf.reindex,
+        pdf.reindex,
+        lfunc_args_and_kwargs=([10, 11, 12, 13], {}),
+        rfunc_args_and_kwargs=([10, 11, 12, 13], {}),
+    )
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 8f8f87c20e0..c15a797713f 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2638,3 +2638,15 @@ def test_series_setitem_mixed_bool_dtype():
     s = cudf.Series([True, False, True])
     with pytest.raises(TypeError):
         s[0] = 10
+
+
+def test_series_duplicate_index_reindex():
+    gs = cudf.Series([0, 1, 2, 3], index=[0, 0, 1, 1])
+    ps = gs.to_pandas()
+
+    assert_exceptions_equal(
+        gs.reindex,
+        ps.reindex,
+        lfunc_args_and_kwargs=([10, 11, 12, 13], {}),
+        rfunc_args_and_kwargs=([10, 11, 12, 13], {}),
+    )
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index b0da82eaeee..387643587d1 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -427,17 +427,12 @@ def union_categoricals_cudf(
     )
 
 
-@_dask_cudf_nvtx_annotate
-def safe_hash(frame):
-    return cudf.Series(frame.hash_values(), index=frame.index)
-
-
 @hash_object_dispatch.register((cudf.DataFrame, cudf.Series))
 @_dask_cudf_nvtx_annotate
 def hash_object_cudf(frame, index=True):
     if index:
-        return safe_hash(frame.reset_index())
-    return safe_hash(frame)
+        frame = frame.reset_index()
+    return frame.hash_values()
 
 
 @hash_object_dispatch.register(cudf.BaseIndex)
@@ -445,10 +440,10 @@ def hash_object_cudf(frame, index=True):
 def hash_object_cudf_index(ind, index=None):
 
     if isinstance(ind, cudf.MultiIndex):
-        return safe_hash(ind.to_frame(index=False))
+        return ind.to_frame(index=False).hash_values()
 
     col = cudf.core.column.as_column(ind)
-    return safe_hash(cudf.Series(col))
+    return cudf.Series(col).hash_values()
 
 
 @group_split_dispatch.register((cudf.Series, cudf.DataFrame))

From 9e7f8a5fdd03d6a24630687621d0ee14c2db26d7 Mon Sep 17 00:00:00 2001
From: Vyas Ramasubramani <vyasr@nvidia.com>
Date: Wed, 15 Nov 2023 15:27:28 -0800
Subject: [PATCH 16/17] Fix dask dependency in custreamz (#14420)

#14407 added a dask dependency to custreamz, but it added too tight of a pinning by requiring the exact same version. This is not valid because rapids-dask-dependency won't release a new version corresponding to each new cudf release, so pinning to the exact same version up to the alpha creates an unsatisfiable constraint.

Authors:
   - Vyas Ramasubramani (https://github.com/vyasr)

Approvers:
   - Ray Douglass (https://github.com/raydouglass)
   - Bradley Dice (https://github.com/bdice)
   - GALI PREM SAGAR (https://github.com/galipremsagar)
---
 conda/recipes/custreamz/meta.yaml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml
index b8c5918ea60..755394e3936 100644
--- a/conda/recipes/custreamz/meta.yaml
+++ b/conda/recipes/custreamz/meta.yaml
@@ -45,7 +45,7 @@ requirements:
     - streamz
     - cudf ={{ version }}
     - cudf_kafka ={{ version }}
-    - rapids-dask-dependency ={{ version }}
+    - rapids-dask-dependency ={{ minor_version }}
     - python-confluent-kafka >=1.9.0,<1.10.0a0
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
 

From e4e69757b340ce45e8ceca53047f079c8b3eb648 Mon Sep 17 00:00:00 2001
From: Bradley Dice <bdice@bradleydice.com>
Date: Wed, 15 Nov 2023 16:31:03 -0800
Subject: [PATCH 17/17] Update cudf_kafka_version.

---
 python/cudf_kafka/CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt
index d55c3fdc076..1e21c873585 100644
--- a/python/cudf_kafka/CMakeLists.txt
+++ b/python/cudf_kafka/CMakeLists.txt
@@ -14,7 +14,7 @@
 
 cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
 
-set(cudf_kafka_version 23.12.00)
+set(cudf_kafka_version 24.02.00)
 
 include(../../fetch_rapids.cmake)