diff --git a/build.sh b/build.sh
index 2ad69712e5d..e5beb51dedf 100755
--- a/build.sh
+++ b/build.sh
@@ -369,7 +369,7 @@ fi
 # build cudf_kafka Python package
 if hasArg cudf_kafka; then
     cd ${REPODIR}/python/cudf_kafka
-    SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR}" \
+    SKBUILD_CONFIGURE_OPTIONS="-DCMAKE_PREFIX_PATH=${INSTALL_PREFIX} -DCMAKE_LIBRARY_PATH=${LIBCUDF_BUILD_DIR} ${EXTRA_CMAKE_ARGS}" \
         SKBUILD_BUILD_OPTIONS="-j${PARALLEL_LEVEL:-1}" \
         python -m pip install --no-build-isolation --no-deps .
 fi
diff --git a/ci/build_wheel.sh b/ci/build_wheel.sh
index 08716cdb3d9..ae1d9c3fb1a 100755
--- a/ci/build_wheel.sh
+++ b/ci/build_wheel.sh
@@ -36,6 +36,8 @@ fi
 
 if [[ ${package_name} == "dask_cudf" ]]; then
     sed -r -i "s/cudf==(.*)\"/cudf${PACKAGE_CUDA_SUFFIX}==\1${alpha_spec}\"/g" ${pyproject_file}
+    sed -r -i "s/dask-cuda==(.*)\"/dask-cuda==\1${alpha_spec}\"/g" ${pyproject_file}
+    sed -r -i "s/rapids-dask-dependency==(.*)\"/rapids-dask-dependency==\1${alpha_spec}\"/g" ${pyproject_file}
 else
     sed -r -i "s/rmm(.*)\"/rmm${PACKAGE_CUDA_SUFFIX}\1${alpha_spec}\"/g" ${pyproject_file}
     # ptxcompiler and cubinlinker aren't version constrained
diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh
index 7574b4174e9..4f1cbc47d1d 100755
--- a/ci/release/update-version.sh
+++ b/ci/release/update-version.sh
@@ -43,6 +43,7 @@ sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g'
 
 # Python CMakeLists updates
 sed_runner 's/'"cudf_version .*)"'/'"cudf_version ${NEXT_FULL_TAG})"'/g' python/cudf/CMakeLists.txt
+sed_runner 's/'"cudf_kafka_version .*)"'/'"cudf_kafka_version ${NEXT_FULL_TAG})"'/g' python/cudf_kafka/CMakeLists.txt
 
 # cpp libcudf_kafka update
 sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/libcudf_kafka/CMakeLists.txt
@@ -80,6 +81,7 @@ DEPENDENCIES=(
   kvikio
   libkvikio
   librmm
+  rapids-dask-dependency
   rmm
 )
 for DEP in "${DEPENDENCIES[@]}"; do
diff --git a/ci/test_wheel_dask_cudf.sh b/ci/test_wheel_dask_cudf.sh
index ec2b1dbf218..e9162b816aa 100755
--- a/ci/test_wheel_dask_cudf.sh
+++ b/ci/test_wheel_dask_cudf.sh
@@ -23,9 +23,6 @@ manylinux="manylinux_${manylinux_version}"
 RAPIDS_PY_WHEEL_NAME="cudf_${manylinux}_${RAPIDS_PY_CUDA_SUFFIX}" rapids-download-wheels-from-s3 ./local-cudf-dep
 python -m pip install --no-deps ./local-cudf-dep/cudf*.whl
 
-# Always install latest dask for testing
-python -m pip install git+https://github.com/dask/dask.git@main git+https://github.com/dask/distributed.git@main git+https://github.com/rapidsai/dask-cuda.git@branch-24.02
-
 # echo to expand wildcard before adding `[extra]` requires for pip
 python -m pip install $(echo ./dist/dask_cudf*.whl)[test]
 
diff --git a/conda/environments/all_cuda-118_arch-x86_64.yaml b/conda/environments/all_cuda-118_arch-x86_64.yaml
index 90f31c645e1..1a944fbdb21 100644
--- a/conda/environments/all_cuda-118_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-118_arch-x86_64.yaml
@@ -24,11 +24,8 @@ dependencies:
 - cudatoolkit
 - cupy>=12.0.0
 - cxx-compiler
-- cython>=3.0.0
-- dask-core>=2023.9.2
+- cython>=3.0.3
 - dask-cuda==24.2.*
-- dask>=2023.9.2
-- distributed>=2023.9.2
 - dlpack>=0.5,<0.6.0a0
 - doxygen=1.9.1
 - fastavro>=0.22.9
@@ -60,7 +57,7 @@ dependencies:
 - numpy>=1.21,<1.25
 - numpydoc
 - nvcc_linux-64=11.8
-- nvcomp==2.6.1
+- nvcomp==3.0.4
 - nvtx>=0.2.1
 - packaging
 - pandas>=1.3,<1.6.0dev0
@@ -80,6 +77,7 @@ dependencies:
 - python-snappy>=0.6.0
 - python>=3.9,<3.11
 - pytorch<1.12.0
+- rapids-dask-dependency==24.2.*
 - rich
 - rmm==24.2.*
 - s3fs>=2022.3.0
diff --git a/conda/environments/all_cuda-120_arch-x86_64.yaml b/conda/environments/all_cuda-120_arch-x86_64.yaml
index 538b47e10ca..21837b652f4 100644
--- a/conda/environments/all_cuda-120_arch-x86_64.yaml
+++ b/conda/environments/all_cuda-120_arch-x86_64.yaml
@@ -17,7 +17,6 @@ dependencies:
 - cmake>=3.26.4
 - cramjam
 - cuda-cudart-dev
-- cuda-gdb
 - cuda-nvcc
 - cuda-nvrtc-dev
 - cuda-nvtx-dev
@@ -26,11 +25,8 @@ dependencies:
 - cuda-version=12.0
 - cupy>=12.0.0
 - cxx-compiler
-- cython>=3.0.0
-- dask-core>=2023.9.2
+- cython>=3.0.3
 - dask-cuda==24.2.*
-- dask>=2023.9.2
-- distributed>=2023.9.2
 - dlpack>=0.5,<0.6.0a0
 - doxygen=1.9.1
 - fastavro>=0.22.9
@@ -59,7 +55,7 @@ dependencies:
 - numba>=0.57,<0.58
 - numpy>=1.21,<1.25
 - numpydoc
-- nvcomp==2.6.1
+- nvcomp==3.0.4
 - nvtx>=0.2.1
 - packaging
 - pandas>=1.3,<1.6.0dev0
@@ -78,6 +74,7 @@ dependencies:
 - python-snappy>=0.6.0
 - python>=3.9,<3.11
 - pytorch<1.12.0
+- rapids-dask-dependency==24.2.*
 - rich
 - rmm==24.2.*
 - s3fs>=2022.3.0
diff --git a/conda/recipes/cudf/meta.yaml b/conda/recipes/cudf/meta.yaml
index 9b5c5f3d14b..27edde1c98a 100644
--- a/conda/recipes/cudf/meta.yaml
+++ b/conda/recipes/cudf/meta.yaml
@@ -57,7 +57,7 @@ requirements:
   host:
     - protobuf ==4.24.*
     - python
-    - cython >=3.0.0
+    - cython >=3.0.3
     - scikit-build >=0.13.1
     - setuptools
     - dlpack >=0.5,<0.6.0a0
diff --git a/conda/recipes/cudf_kafka/build.sh b/conda/recipes/cudf_kafka/build.sh
index f4bb6e1bc91..9458349d101 100644
--- a/conda/recipes/cudf_kafka/build.sh
+++ b/conda/recipes/cudf_kafka/build.sh
@@ -1,16 +1,3 @@
 # Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
-# This assumes the script is executed from the root of the repo directory
-# Need to set CUDA_HOME inside conda environments because the hacked together
-# setup.py for cudf-kafka searches that way.
-# TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates
-# cudf_kafka to use scikit-build
-CUDA_MAJOR=${RAPIDS_CUDA_VERSION%%.*}
-if [[ ${CUDA_MAJOR} == "12" ]]; then
-    target_name="x86_64-linux"
-    if [[ ! $(arch) == "x86_64" ]]; then
-        target_name="sbsa-linux"
-    fi
-    export CUDA_HOME="${PREFIX}/targets/${target_name}/"
-fi
 ./build.sh -v cudf_kafka
diff --git a/conda/recipes/cudf_kafka/conda_build_config.yaml b/conda/recipes/cudf_kafka/conda_build_config.yaml
index b63a136ad2d..c98c2701653 100644
--- a/conda/recipes/cudf_kafka/conda_build_config.yaml
+++ b/conda/recipes/cudf_kafka/conda_build_config.yaml
@@ -9,3 +9,9 @@ sysroot_version:
 
 cmake_version:
   - ">=3.26.4"
+
+cuda_compiler:
+  - cuda-nvcc
+
+cuda11_compiler:
+  - nvcc
diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml
index cdc547b4d68..343ec2519f1 100644
--- a/conda/recipes/cudf_kafka/meta.yaml
+++ b/conda/recipes/cudf_kafka/meta.yaml
@@ -33,28 +33,31 @@ build:
     - SCCACHE_S3_KEY_PREFIX=cudf-kafka-linux64 # [linux64]
     - SCCACHE_S3_USE_SSL
     - SCCACHE_S3_NO_CREDENTIALS
-    # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates
-    # cudf_kafka to use scikit-build
-    - RAPIDS_CUDA_VERSION
+  ignore_run_exports_from:
+    {% if cuda_major == "11" %}
+    - {{ compiler('cuda11') }}
+    {% endif %}
 
 requirements:
   build:
     - cmake {{ cmake_version }}
+    - ninja
     - {{ compiler('c') }}
     - {{ compiler('cxx') }}
-    - ninja
-    - sysroot_{{ target_platform }} {{ sysroot_version }}
-    # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates
-    # cudf_kafka to use scikit-build
-    {% if cuda_major == "12" %}
-    - cuda-gdb
+    {% if cuda_major == "11" %}
+    - {{ compiler('cuda11') }} ={{ cuda_version }}
+    {% else %}
+    - {{ compiler('cuda') }}
     {% endif %}
+    - cuda-version ={{ cuda_version }}
+    - sysroot_{{ target_platform }} {{ sysroot_version }}
   host:
     - python
-    - cython >=3.0.0
+    - cython >=3.0.3
     - cuda-version ={{ cuda_version }}
     - cudf ={{ version }}
     - libcudf_kafka ={{ version }}
+    - scikit-build >=0.13.1
     - setuptools
     {% if cuda_major == "12" %}
     - cuda-cudart-dev
diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml
index fb6efabffd4..755394e3936 100644
--- a/conda/recipes/custreamz/meta.yaml
+++ b/conda/recipes/custreamz/meta.yaml
@@ -45,9 +45,7 @@ requirements:
     - streamz
     - cudf ={{ version }}
     - cudf_kafka ={{ version }}
-    - dask >=2023.9.2
-    - dask-core >=2023.9.2
-    - distributed >=2023.9.2
+    - rapids-dask-dependency ={{ minor_version }}
     - python-confluent-kafka >=1.9.0,<1.10.0a0
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
 
diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml
index 9dc9f76d9f5..16638926492 100644
--- a/conda/recipes/dask-cudf/meta.yaml
+++ b/conda/recipes/dask-cudf/meta.yaml
@@ -37,17 +37,11 @@ build:
 requirements:
   host:
     - python
-    - cudf ={{ version }}
-    - dask >=2023.9.2
-    - dask-core >=2023.9.2
-    - distributed >=2023.9.2
     - cuda-version ={{ cuda_version }}
   run:
     - python
     - cudf ={{ version }}
-    - dask >=2023.9.2
-    - dask-core >=2023.9.2
-    - distributed >=2023.9.2
+    - rapids-dask-dependency ={{ minor_version }}
     - {{ pin_compatible('cuda-version', max_pin='x', min_pin='x') }}
 
 test:
diff --git a/conda/recipes/dask-cudf/run_test.sh b/conda/recipes/dask-cudf/run_test.sh
deleted file mode 100644
index e7238d00f2b..00000000000
--- a/conda/recipes/dask-cudf/run_test.sh
+++ /dev/null
@@ -1,36 +0,0 @@
-#!/bin/bash
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
-
-set -e
-
-# Logger function for build status output
-function logger() {
-  echo -e "\n>>>> $@\n"
-}
-
-# Importing cudf on arm64 CPU only nodes is currently not working due to a
-# difference in reported gpu devices between arm64 and amd64
-ARCH=$(arch)
-
-if [ "${ARCH}" = "aarch64" ]; then
-  logger "Skipping tests on arm64"
-  exit 0
-fi
-
-# Dask & Distributed option to install main(nightly) or `conda-forge` packages.
-export INSTALL_DASK_MAIN=1
-
-# Dask version to install when `INSTALL_DASK_MAIN=0`
-export DASK_STABLE_VERSION="2023.9.2"
-
-# Install the conda-forge or nightly version of dask and distributed
-if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then
-    rapids-logger "rapids-mamba-retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'"
-    rapids-mamba-retry install -c dask/label/dev "dask/label/dev::dask" "dask/label/dev::distributed"
-else
-    rapids-logger "rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall"
-    rapids-mamba-retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall
-fi
-
-logger "python -c 'import dask_cudf'"
-python -c "import dask_cudf"
diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml
index 05b2135184b..fa06ed048b7 100644
--- a/conda/recipes/libcudf/conda_build_config.yaml
+++ b/conda/recipes/libcudf/conda_build_config.yaml
@@ -38,7 +38,7 @@ spdlog_version:
   - ">=1.11.0,<1.12"
 
 nvcomp_version:
-  - "=2.6.1"
+  - "=3.0.4"
 
 zlib_version:
   - ">=1.2.13"
diff --git a/cpp/benchmarks/text/vocab.cpp b/cpp/benchmarks/text/vocab.cpp
index 6922b7214ff..80942e2697d 100644
--- a/cpp/benchmarks/text/vocab.cpp
+++ b/cpp/benchmarks/text/vocab.cpp
@@ -53,7 +53,7 @@ static void bench_vocab_tokenize(nvbench::state& state)
 
   auto const vocab_col = [] {
     data_profile const profile = data_profile_builder().no_validity().distribution(
-      cudf::type_id::STRING, distribution_id::NORMAL, 0, 5);
+      cudf::type_id::STRING, distribution_id::NORMAL, 0, 15);
     auto const col = create_random_column(cudf::type_id::STRING, row_count{100}, profile);
     return cudf::strings::filter_characters_of_type(
       cudf::strings_column_view(col->view()),
diff --git a/cpp/cmake/thirdparty/get_nvbench.cmake b/cpp/cmake/thirdparty/get_nvbench.cmake
index f0642145fa0..bbd22693ba4 100644
--- a/cpp/cmake/thirdparty/get_nvbench.cmake
+++ b/cpp/cmake/thirdparty/get_nvbench.cmake
@@ -21,7 +21,7 @@ function(find_and_configure_nvbench)
   set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches")
   rapids_cpm_package_override("${cudf_patch_dir}/nvbench_override.json")
 
-  rapids_cpm_nvbench()
+  rapids_cpm_nvbench(BUILD_STATIC)
 
 endfunction()
 
diff --git a/cpp/cmake/thirdparty/patches/nvbench_override.json b/cpp/cmake/thirdparty/patches/nvbench_override.json
index 7be868081b6..f85bdb9486c 100644
--- a/cpp/cmake/thirdparty/patches/nvbench_override.json
+++ b/cpp/cmake/thirdparty/patches/nvbench_override.json
@@ -9,8 +9,8 @@
           "fixed_in" : ""
         },
         {
-          "file" : "nvbench/use_existing_fmt.diff",
-          "issue" : "Fix add support for using an existing fmt [https://github.com/NVIDIA/nvbench/pull/125]",
+          "file" : "nvbench/nvml_with_static_builds.diff",
+          "issue" : "Add support for nvml with static nvbench [https://github.com/NVIDIA/nvbench/pull/148]",
           "fixed_in" : ""
         }
       ]
diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh
index 35851a99822..b1ff0bbaea7 100644
--- a/cpp/include/cudf/column/column_device_view.cuh
+++ b/cpp/include/cudf/column/column_device_view.cuh
@@ -16,6 +16,7 @@
 #pragma once
 
 #include <cudf/column/column_view.hpp>
+#include <cudf/detail/offsets_iterator.cuh>
 #include <cudf/detail/utilities/alignment.hpp>
 #include <cudf/fixed_point/fixed_point.hpp>
 #include <cudf/lists/list_view.hpp>
@@ -442,10 +443,11 @@ class alignas(16) column_device_view : public detail::column_device_view_base {
   __device__ T element(size_type element_index) const noexcept
   {
     size_type index       = element_index + offset();  // account for this view's _offset
-    auto const* d_offsets = d_children[strings_column_view::offsets_column_index].data<int32_t>();
     char const* d_strings = d_children[strings_column_view::chars_column_index].data<char>();
-    size_type offset      = d_offsets[index];
-    return string_view{d_strings + offset, d_offsets[index + 1] - offset};
+    auto const offsets    = d_children[strings_column_view::offsets_column_index];
+    auto const itr        = cudf::detail::input_offsetalator(offsets.head(), offsets.type());
+    auto const offset     = itr[index];
+    return string_view{d_strings + offset, static_cast<cudf::size_type>(itr[index + 1] - offset)};
   }
 
  private:
diff --git a/cpp/include/cudf/detail/indexalator.cuh b/cpp/include/cudf/detail/indexalator.cuh
index 6532dae3695..4d261c54b29 100644
--- a/cpp/include/cudf/detail/indexalator.cuh
+++ b/cpp/include/cudf/detail/indexalator.cuh
@@ -56,10 +56,69 @@ namespace detail {
  *  auto result = thrust::find(thrust::device, begin, end, size_type{12} );
  * @endcode
  */
-using input_indexalator = input_normalator<cudf::size_type>;
+struct input_indexalator : base_normalator<input_indexalator, cudf::size_type> {
+  friend struct base_normalator<input_indexalator, cudf::size_type>;  // for CRTP
+
+  using reference = cudf::size_type const;  // this keeps STL and thrust happy
+
+  input_indexalator()                                    = default;
+  input_indexalator(input_indexalator const&)            = default;
+  input_indexalator(input_indexalator&&)                 = default;
+  input_indexalator& operator=(input_indexalator const&) = default;
+  input_indexalator& operator=(input_indexalator&&)      = default;
+
+  /**
+   * @brief Indirection operator returns the value at the current iterator position
+   */
+  __device__ inline cudf::size_type operator*() const { return operator[](0); }
+
+  /**
+   * @brief Dispatch functor for reading any index type as a `cudf::size_type` value
+   */
+  struct normalize_type {
+    template <typename T, CUDF_ENABLE_IF(cudf::is_index_type<T>())>
+    __device__ cudf::size_type operator()(void const* tp)
+    {
+      return static_cast<cudf::size_type>(*static_cast<T const*>(tp));
+    }
+    template <typename T, CUDF_ENABLE_IF(not cudf::is_index_type<T>())>
+    __device__ cudf::size_type operator()(void const*)
+    {
+      CUDF_UNREACHABLE("only integral types are supported");
+    }
+  };
+
+  /**
+   * @brief Array subscript operator returns the value at the input
+   * `idx` position as a `cudf::size_type` value.
+   */
+  __device__ inline cudf::size_type operator[](size_type idx) const
+  {
+    void const* tp = p_ + (idx * this->width_);  // byte address of element `idx`
+    return type_dispatcher(this->dtype_, normalize_type{}, tp);
+  }
+
+  /**
+   * @brief Create an input index normalizing iterator
+   *
+   * Use the indexalator_factory to create an iterator instance.
+   *
+   * @param data   Pointer to an integer array in device memory.
+   * @param dtype  Type of the elements in `data`
+   * @param offset Number of elements (not bytes) by which to advance `data`
+   */
+  CUDF_HOST_DEVICE input_indexalator(void const* data, data_type dtype, cudf::size_type offset = 0)
+    : base_normalator<input_indexalator, cudf::size_type>(dtype), p_{static_cast<char const*>(data)}
+  {
+    p_ += offset * this->width_;
+  }
+
+ protected:
+  char const* p_;  /// pointer to the integer data in device memory
+};
 
 /**
- * @brief The index normalizing output iterator.
+ * @brief The index normalizing output iterator
  *
  * This is an iterator that can be used for index types (integers) without
  * requiring a type-specific instance. It can be used for any iterator
@@ -82,7 +141,75 @@ using input_indexalator = input_normalator<cudf::size_type>;
  *                      thrust::less<Element>());
  * @endcode
  */
-using output_indexalator = output_normalator<cudf::size_type>;
+struct output_indexalator : base_normalator<output_indexalator, cudf::size_type> {
+  friend struct base_normalator<output_indexalator, cudf::size_type>;  // for CRTP
+
+  using reference = output_indexalator const&;  // required for output iterators
+
+  output_indexalator()                                     = default;
+  output_indexalator(output_indexalator const&)            = default;
+  output_indexalator(output_indexalator&&)                 = default;
+  output_indexalator& operator=(output_indexalator const&) = default;
+  output_indexalator& operator=(output_indexalator&&)      = default;
+
+  /**
+   * @brief Indirection operator returns this iterator instance in order
+   * to capture the `operator=(cudf::size_type)` calls.
+   */
+  __device__ inline reference operator*() const { return *this; }
+
+  /**
+   * @brief Array subscript operator returns an iterator instance at the specified `idx` position.
+   *
+   * This allows capturing the subsequent `operator=(cudf::size_type)` call in this class.
+   */
+  __device__ inline output_indexalator const operator[](size_type idx) const
+  {
+    output_indexalator tmp{*this};
+    tmp.p_ += (idx * this->width_);  // advance by `idx` elements, measured in bytes
+    return tmp;
+  }
+
+  /**
+   * @brief Dispatch functor for setting the index value from a size_type value.
+   */
+  struct normalize_type {
+    template <typename T, CUDF_ENABLE_IF(cudf::is_index_type<T>())>
+    __device__ void operator()(void* tp, cudf::size_type const value)
+    {
+      (*static_cast<T*>(tp)) = static_cast<T>(value);
+    }
+    template <typename T, CUDF_ENABLE_IF(not cudf::is_index_type<T>())>
+    __device__ void operator()(void*, cudf::size_type const)
+    {
+      CUDF_UNREACHABLE("only index types are supported");
+    }
+  };
+
+  /**
+   * @brief Assign a `cudf::size_type` value to the current iterator position
+   */
+  __device__ inline reference operator=(cudf::size_type const value) const
+  {
+    void* tp = p_;
+    type_dispatcher(this->dtype_, normalize_type{}, tp, value);
+    return *this;
+  }
+
+  /**
+   * @brief Create an output normalizing iterator
+   *
+   * @param data  Pointer to an integer array in device memory.
+   * @param dtype Type of the elements in `data`
+   */
+  CUDF_HOST_DEVICE output_indexalator(void* data, data_type dtype)
+    : base_normalator<output_indexalator, cudf::size_type>(dtype), p_{static_cast<char*>(data)}
+  {
+  }
+
+ protected:
+  char* p_;  /// pointer to the integer data in device memory
+};
 
 /**
  * @brief Use this class to create an indexalator instance.
@@ -92,14 +219,12 @@ struct indexalator_factory {
    * @brief A type_dispatcher functor to create an input iterator from an indices column.
    */
   struct input_indexalator_fn {
-    template <typename IndexType, std::enable_if_t<is_index_type<IndexType>()>* = nullptr>
+    template <typename IndexType, CUDF_ENABLE_IF(is_index_type<IndexType>())>
     input_indexalator operator()(column_view const& indices)
     {
       return input_indexalator(indices.data<IndexType>(), indices.type());
     }
-    template <typename IndexType,
-              typename... Args,
-              std::enable_if_t<not is_index_type<IndexType>()>* = nullptr>
+    template <typename IndexType, typename... Args, CUDF_ENABLE_IF(not is_index_type<IndexType>())>
     input_indexalator operator()(Args&&... args)
     {
       CUDF_FAIL("indices must be an index type");
@@ -110,16 +235,14 @@ struct indexalator_factory {
    * @brief Use this class to create an indexalator to a scalar index.
    */
   struct input_indexalator_scalar_fn {
-    template <typename IndexType, std::enable_if_t<is_index_type<IndexType>()>* = nullptr>
+    template <typename IndexType, CUDF_ENABLE_IF(is_index_type<IndexType>())>
     input_indexalator operator()(scalar const& index)
     {
       // note: using static_cast<scalar_type_t<IndexType> const&>(index) creates a copy
       auto const scalar_impl = static_cast<scalar_type_t<IndexType> const*>(&index);
       return input_indexalator(scalar_impl->data(), index.type());
     }
-    template <typename IndexType,
-              typename... Args,
-              std::enable_if_t<not is_index_type<IndexType>()>* = nullptr>
+    template <typename IndexType, typename... Args, CUDF_ENABLE_IF(not is_index_type<IndexType>())>
     input_indexalator operator()(Args&&... args)
     {
       CUDF_FAIL("scalar must be an index type");
@@ -130,14 +253,12 @@ struct indexalator_factory {
    * @brief A type_dispatcher functor to create an output iterator from an indices column.
    */
   struct output_indexalator_fn {
-    template <typename IndexType, std::enable_if_t<is_index_type<IndexType>()>* = nullptr>
+    template <typename IndexType, CUDF_ENABLE_IF(is_index_type<IndexType>())>
     output_indexalator operator()(mutable_column_view const& indices)
     {
       return output_indexalator(indices.data<IndexType>(), indices.type());
     }
-    template <typename IndexType,
-              typename... Args,
-              std::enable_if_t<not is_index_type<IndexType>()>* = nullptr>
+    template <typename IndexType, typename... Args, CUDF_ENABLE_IF(not is_index_type<IndexType>())>
     output_indexalator operator()(Args&&... args)
     {
       CUDF_FAIL("indices must be an index type");
diff --git a/cpp/include/cudf/detail/normalizing_iterator.cuh b/cpp/include/cudf/detail/normalizing_iterator.cuh
index 35a695d47df..8f90afc3e57 100644
--- a/cpp/include/cudf/detail/normalizing_iterator.cuh
+++ b/cpp/include/cudf/detail/normalizing_iterator.cuh
@@ -33,7 +33,7 @@ namespace detail {
  * @tparam Integer The type the iterator normalizes to
  */
 template <class Derived, typename Integer>
-struct base_normalator {
+struct alignas(16) base_normalator {
   static_assert(cudf::is_index_type<Integer>());
   using difference_type   = std::ptrdiff_t;
   using value_type        = Integer;
@@ -204,7 +204,7 @@ struct base_normalator {
 
  private:
   struct integer_sizeof_fn {
-    template <typename T, std::enable_if_t<not cudf::is_index_type<T>()>* = nullptr>
+    template <typename T, CUDF_ENABLE_IF(not cudf::is_fixed_width<T>())>
     CUDF_HOST_DEVICE constexpr std::size_t operator()() const
     {
 #ifndef __CUDA_ARCH__
@@ -213,7 +213,7 @@ struct base_normalator {
       CUDF_UNREACHABLE("only integral types are supported");
 #endif
     }
-    template <typename T, std::enable_if_t<cudf::is_index_type<T>()>* = nullptr>
+    template <typename T, CUDF_ENABLE_IF(cudf::is_fixed_width<T>())>
     CUDF_HOST_DEVICE constexpr std::size_t operator()() const noexcept
     {
       return sizeof(T);
@@ -229,160 +229,16 @@ struct base_normalator {
     width_ = static_cast<int32_t>(type_dispatcher(dtype, integer_sizeof_fn{}));
   }
 
-  int32_t width_;    /// integer type width = 1,2,4, or 8
-  data_type dtype_;  /// for type-dispatcher calls
-};
-
-/**
- * @brief The integer normalizing input iterator
- *
- * This is an iterator that can be used for index types (integers) without
- * requiring a type-specific instance. It can be used for any iterator
- * interface for reading an array of integer values of type
- * int8, int16, int32, int64, uint8, uint16, uint32, or uint64.
- * Reading specific elements always return a type of `Integer`
- *
- * @tparam Integer Type returned by all read functions
- */
-template <typename Integer>
-struct input_normalator : base_normalator<input_normalator<Integer>, Integer> {
-  friend struct base_normalator<input_normalator<Integer>, Integer>;  // for CRTP
-
-  using reference = Integer const;  // this keeps STL and thrust happy
-
-  input_normalator()                                   = default;
-  input_normalator(input_normalator const&)            = default;
-  input_normalator(input_normalator&&)                 = default;
-  input_normalator& operator=(input_normalator const&) = default;
-  input_normalator& operator=(input_normalator&&)      = default;
-
-  /**
-   * @brief Indirection operator returns the value at the current iterator position
-   */
-  __device__ inline Integer operator*() const { return operator[](0); }
-
-  /**
-   * @brief Dispatch functor for resolving a Integer value from any integer type
-   */
-  struct normalize_type {
-    template <typename T, std::enable_if_t<cudf::is_index_type<T>()>* = nullptr>
-    __device__ Integer operator()(void const* tp)
-    {
-      return static_cast<Integer>(*static_cast<T const*>(tp));
-    }
-    template <typename T, std::enable_if_t<not cudf::is_index_type<T>()>* = nullptr>
-    __device__ Integer operator()(void const*)
-    {
-      CUDF_UNREACHABLE("only integral types are supported");
-    }
-  };
-
   /**
-   * @brief Array subscript operator returns a value at the input
-   * `idx` position as a `Integer` value.
-   */
-  __device__ inline Integer operator[](size_type idx) const
-  {
-    void const* tp = p_ + (idx * this->width_);
-    return type_dispatcher(this->dtype_, normalize_type{}, tp);
-  }
-
-  /**
-   * @brief Create an input index normalizing iterator.
-   *
-   * Use the indexalator_factory to create an iterator instance.
-   *
-   * @param data      Pointer to an integer array in device memory.
-   * @param data_type Type of data in data
-   */
-  CUDF_HOST_DEVICE input_normalator(void const* data, data_type dtype, cudf::size_type offset = 0)
-    : base_normalator<input_normalator<Integer>, Integer>(dtype), p_{static_cast<char const*>(data)}
-  {
-    p_ += offset * this->width_;
-  }
-
-  char const* p_;  /// pointer to the integer data in device memory
-};
-
-/**
- * @brief The integer normalizing output iterator
- *
- * This is an iterator that can be used for index types (integers) without
- * requiring a type-specific instance. It can be used for any iterator
- * interface for writing an array of integer values of type
- * int8, int16, int32, int64, uint8, uint16, uint32, or uint64.
- * Setting specific elements always accept the `Integer` type values.
- *
- * @tparam Integer The type used for all write functions
- */
-template <typename Integer>
-struct output_normalator : base_normalator<output_normalator<Integer>, Integer> {
-  friend struct base_normalator<output_normalator<Integer>, Integer>;  // for CRTP
-
-  using reference = output_normalator const&;  // required for output iterators
-
-  output_normalator()                                    = default;
-  output_normalator(output_normalator const&)            = default;
-  output_normalator(output_normalator&&)                 = default;
-  output_normalator& operator=(output_normalator const&) = default;
-  output_normalator& operator=(output_normalator&&)      = default;
-
-  /**
-   * @brief Indirection operator returns this iterator instance in order
-   * to capture the `operator=(Integer)` calls.
-   */
-  __device__ inline output_normalator const& operator*() const { return *this; }
-
-  /**
-   * @brief Array subscript operator returns an iterator instance at the specified `idx` position.
-   *
-   * This allows capturing the subsequent `operator=(Integer)` call in this class.
-   */
-  __device__ inline output_normalator const operator[](size_type idx) const
-  {
-    output_normalator tmp{*this};
-    tmp.p_ += (idx * this->width_);
-    return tmp;
-  }
-
-  /**
-   * @brief Dispatch functor for setting the index value from a size_type value.
-   */
-  struct normalize_type {
-    template <typename T, std::enable_if_t<cudf::is_index_type<T>()>* = nullptr>
-    __device__ void operator()(void* tp, Integer const value)
-    {
-      (*static_cast<T*>(tp)) = static_cast<T>(value);
-    }
-    template <typename T, std::enable_if_t<not cudf::is_index_type<T>()>* = nullptr>
-    __device__ void operator()(void*, Integer const)
-    {
-      CUDF_UNREACHABLE("only index types are supported");
-    }
-  };
-
-  /**
-   * @brief Assign an Integer value to the current iterator position
-   */
-  __device__ inline output_normalator const& operator=(Integer const value) const
-  {
-    void* tp = p_;
-    type_dispatcher(this->dtype_, normalize_type{}, tp, value);
-    return *this;
-  }
-
-  /**
-   * @brief Create an output normalizing iterator
-   *
-   * @param data      Pointer to an integer array in device memory.
-   * @param data_type Type of data in data
+   * @brief Constructor assigns width and type member variables for base class.
    */
-  CUDF_HOST_DEVICE output_normalator(void* data, data_type dtype)
-    : base_normalator<output_normalator<Integer>, Integer>(dtype), p_{static_cast<char*>(data)}
+  explicit CUDF_HOST_DEVICE base_normalator(data_type dtype, int32_t width)
+    : width_(width), dtype_(dtype)
   {
   }
 
-  char* p_;  /// pointer to the integer data in device memory
+  int32_t width_;    /// integer type width = 1,2,4, or 8
+  data_type dtype_;  /// for type-dispatcher calls
 };
 
 }  // namespace detail
diff --git a/cpp/include/cudf/detail/offsets_iterator.cuh b/cpp/include/cudf/detail/offsets_iterator.cuh
new file mode 100644
index 00000000000..3eb77b32353
--- /dev/null
+++ b/cpp/include/cudf/detail/offsets_iterator.cuh
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/detail/normalizing_iterator.cuh>
+#include <cudf/types.hpp>
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief The offsets normalizing input iterator
+ *
+ * This is an iterator that can be used for offsets where the underlying
+ * type may be int32_t or int64_t. Values are always returned as int64_t.
+ *
+ * Use the offsetalator_factory to create an appropriate input iterator
+ * from an offsets column_view.
+ */
+struct input_offsetalator : base_normalator<input_offsetalator, int64_t> {
+  friend struct base_normalator<input_offsetalator, int64_t>;  // for CRTP
+
+  using reference = int64_t const;  // this keeps STL and thrust happy
+
+  input_offsetalator()                                     = default;
+  input_offsetalator(input_offsetalator const&)            = default;
+  input_offsetalator(input_offsetalator&&)                 = default;
+  input_offsetalator& operator=(input_offsetalator const&) = default;
+  input_offsetalator& operator=(input_offsetalator&&)      = default;
+
+  /**
+   * @brief Indirection operator returns the value at the current iterator position
+   */
+  __device__ inline int64_t operator*() const { return operator[](0); }
+
+  /**
+   * @brief Array subscript operator returns the value at the input
+   * `idx` position as an int64_t value (int32_t data is widened).
+   */
+  __device__ inline int64_t operator[](size_type idx) const
+  {
+    void const* tp = p_ + (idx * this->width_);
+    return this->width_ == sizeof(int32_t) ? static_cast<int64_t>(*static_cast<int32_t const*>(tp))
+                                           : *static_cast<int64_t const*>(tp);
+  }
+
+  /**
+   * @brief Create an input offsets normalizing iterator.
+   *
+   * Use the offsetalator_factory to create an iterator instance.
+   *
+   * @param data  Pointer to an integer array in device memory.
+   * @param dtype Type of data in `data`; must be INT32 or INT64
+   */
+  CUDF_HOST_DEVICE input_offsetalator(void const* data, data_type dtype)
+    : base_normalator<input_offsetalator, int64_t>(
+        dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)),
+      p_{static_cast<char const*>(data)}
+  {
+#ifndef __CUDA_ARCH__
+    CUDF_EXPECTS(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64,
+                 "Unexpected offsets type");
+#else
+    cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) &&
+                "Unexpected offsets type");
+#endif
+  }
+
+ protected:
+  char const* p_;  /// pointer to the integer data in device memory
+};
+
+/**
+ * @brief The offsets normalizing output iterator
+ *
+ * This is an iterator that can be used for storing offsets values
+ * where the underlying type may be either int32_t or int64_t.
+ *
+ * Use the offsetalator_factory to create an appropriate output iterator
+ * from a mutable_column_view.
+ *
+ */
+struct output_offsetalator : base_normalator<output_offsetalator, int64_t> {
+  friend struct base_normalator<output_offsetalator, int64_t>;  // for CRTP
+
+  using reference = output_offsetalator const&;  // required for output iterators
+
+  output_offsetalator()                                      = default;
+  output_offsetalator(output_offsetalator const&)            = default;
+  output_offsetalator(output_offsetalator&&)                 = default;
+  output_offsetalator& operator=(output_offsetalator const&) = default;
+  output_offsetalator& operator=(output_offsetalator&&)      = default;
+
+  /**
+   * @brief Indirection operator returns this iterator instance in order
+   * to capture the `operator=(int64)` calls.
+   */
+  __device__ inline output_offsetalator const& operator*() const { return *this; }
+
+  /**
+   * @brief Array subscript operator returns an iterator instance at the specified `idx` position.
+   *
+   * This allows capturing the subsequent `operator=(int64)` call in this class.
+   */
+  __device__ inline output_offsetalator const operator[](size_type idx) const
+  {
+    output_offsetalator tmp{*this};
+    tmp.p_ += (idx * this->width_);
+    return tmp;
+  }
+
+  /**
+   * @brief Assign an offset value at the current position; cast to int32_t for INT32 data
+   */
+  __device__ inline output_offsetalator const& operator=(int64_t const value) const
+  {
+    void* tp = p_;
+    if (this->width_ == sizeof(int32_t)) {
+      (*static_cast<int32_t*>(tp)) = static_cast<int32_t>(value);
+    } else {
+      (*static_cast<int64_t*>(tp)) = value;
+    }
+    return *this;
+  }
+
+  /**
+   * @brief Create an output offsets iterator
+   *
+   * @param data  Pointer to an integer array in device memory.
+   * @param dtype Type of data in `data`; must be INT32 or INT64
+   */
+  CUDF_HOST_DEVICE output_offsetalator(void* data, data_type dtype)
+    : base_normalator<output_offsetalator, int64_t>(
+        dtype, dtype.id() == type_id::INT32 ? sizeof(int32_t) : sizeof(int64_t)),
+      p_{static_cast<char*>(data)}
+  {
+#ifndef __CUDA_ARCH__
+    CUDF_EXPECTS(dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64,
+                 "Unexpected offsets type");
+#else
+    cudf_assert((dtype.id() == type_id::INT32 || dtype.id() == type_id::INT64) &&
+                "Unexpected offsets type");
+#endif
+  }
+
+ protected:
+  char* p_;  /// pointer to the integer data in device memory
+};
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/include/cudf/detail/offsets_iterator_factory.cuh b/cpp/include/cudf/detail/offsets_iterator_factory.cuh
new file mode 100644
index 00000000000..5b4c6b825d2
--- /dev/null
+++ b/cpp/include/cudf/detail/offsets_iterator_factory.cuh
@@ -0,0 +1,47 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include <cudf/column/column_view.hpp>
+#include <cudf/detail/offsets_iterator.cuh>
+
+namespace cudf {
+namespace detail {
+
+/**
+ * @brief Factory for creating input and output offsetalator instances from offsets columns.
+ */
+struct offsetalator_factory {
+  /**
+   * @brief Create an input_offsetalator from the head() pointer and type() of an offsets column
+   */
+  static input_offsetalator make_input_iterator(column_view const& offsets)
+  {
+    return input_offsetalator(offsets.head(), offsets.type());
+  }
+
+  /**
+   * @brief Create an output_offsetalator from the head() pointer and type() of an offsets column
+   */
+  static output_offsetalator make_output_iterator(mutable_column_view const& offsets)
+  {
+    return output_offsetalator(offsets.head(), offsets.type());
+  }
+};
+
+}  // namespace detail
+}  // namespace cudf
diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index ac885c54356..435583e805d 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -1307,6 +1307,7 @@ class csv_reader_options_builder {
  * @endcode
  *
  * @param options Settings for controlling reading behavior
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource used to allocate device memory of the table in the returned
  * table_with_metadata
  *
@@ -1314,6 +1315,7 @@ class csv_reader_options_builder {
  */
 table_with_metadata read_csv(
   csv_reader_options options,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
@@ -1715,9 +1717,11 @@ class csv_writer_options_builder {
  * @endcode
  *
  * @param options Settings for controlling writing behavior
+ * @param stream CUDA stream used for device memory operations and kernel launches
  * @param mr Device memory resource to use for device memory allocation
  */
 void write_csv(csv_writer_options const& options,
+               rmm::cuda_stream_view stream        = cudf::get_default_stream(),
                rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp
index 9fdc7a47fb9..40ddcf385b0 100644
--- a/cpp/include/cudf/io/detail/csv.hpp
+++ b/cpp/include/cudf/io/detail/csv.hpp
@@ -17,7 +17,6 @@
 #pragma once
 
 #include <cudf/io/csv.hpp>
-#include <cudf/utilities/default_stream.hpp>
 
 #include <rmm/cuda_stream_view.hpp>
 
diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp
index 82d59803c25..9531a012e49 100644
--- a/cpp/include/cudf/io/orc_metadata.hpp
+++ b/cpp/include/cudf/io/orc_metadata.hpp
@@ -141,10 +141,10 @@ using binary_statistics = sum_statistics<int64_t>;
  * the UNIX epoch. The `minimum_utc` and `maximum_utc` are the same values adjusted to UTC.
  */
 struct timestamp_statistics : minmax_statistics<int64_t> {
-  std::optional<int64_t> minimum_utc;    ///< minimum in milliseconds
-  std::optional<int64_t> maximum_utc;    ///< maximum in milliseconds
-  std::optional<int32_t> minimum_nanos;  ///< nanoseconds part of the minimum
-  std::optional<int32_t> maximum_nanos;  ///< nanoseconds part of the maximum
+  std::optional<int64_t> minimum_utc;     ///< minimum in milliseconds
+  std::optional<int64_t> maximum_utc;     ///< maximum in milliseconds
+  std::optional<uint32_t> minimum_nanos;  ///< nanoseconds part of the minimum
+  std::optional<uint32_t> maximum_nanos;  ///< nanoseconds part of the maximum
 };
 
 namespace orc {
diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp
index e94dfea9dcf..b9f2e0d9868 100644
--- a/cpp/include/cudf_test/column_wrapper.hpp
+++ b/cpp/include/cudf_test/column_wrapper.hpp
@@ -803,7 +803,8 @@ class strings_column_wrapper : public detail::column_wrapper {
       offsets, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
     auto d_bitmask = cudf::detail::make_device_uvector_sync(
       null_mask, cudf::test::get_default_stream(), rmm::mr::get_current_device_resource());
-    wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask, null_count);
+    wrapped = cudf::make_strings_column(
+      d_chars, d_offsets, d_bitmask, null_count, cudf::test::get_default_stream());
   }
 
   /**
@@ -1846,7 +1847,8 @@ class structs_column_wrapper : public detail::column_wrapper {
                    child_column_wrappers.end(),
                    std::back_inserter(child_columns),
                    [&](auto const& column_wrapper) {
-                     return std::make_unique<cudf::column>(column_wrapper.get());
+                     return std::make_unique<cudf::column>(column_wrapper.get(),
+                                                           cudf::test::get_default_stream());
                    });
     init(std::move(child_columns), validity);
   }
@@ -1882,7 +1884,8 @@ class structs_column_wrapper : public detail::column_wrapper {
                    child_column_wrappers.end(),
                    std::back_inserter(child_columns),
                    [&](auto const& column_wrapper) {
-                     return std::make_unique<cudf::column>(column_wrapper.get());
+                     return std::make_unique<cudf::column>(column_wrapper.get(),
+                                                           cudf::test::get_default_stream());
                    });
     init(std::move(child_columns), validity_iter);
   }
@@ -1906,8 +1909,11 @@ class structs_column_wrapper : public detail::column_wrapper {
       return cudf::test::detail::make_null_mask(validity.begin(), validity.end());
     }();
 
-    wrapped = cudf::make_structs_column(
-      num_rows, std::move(child_columns), null_count, std::move(null_mask));
+    wrapped = cudf::make_structs_column(num_rows,
+                                        std::move(child_columns),
+                                        null_count,
+                                        std::move(null_mask),
+                                        cudf::test::get_default_stream());
   }
 
   template <typename V>
diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt
index ec7902d8e28..e31f6bd4096 100644
--- a/cpp/libcudf_kafka/CMakeLists.txt
+++ b/cpp/libcudf_kafka/CMakeLists.txt
@@ -21,7 +21,7 @@ include(rapids-export)
 include(rapids-find)
 
 project(
-  CUDA_KAFKA
+  CUDF_KAFKA
   VERSION 24.02.00
   LANGUAGES CXX
 )
@@ -64,7 +64,7 @@ add_library(cudf_kafka SHARED src/kafka_consumer.cpp src/kafka_callback.cpp)
 # ##################################################################################################
 # * include paths ---------------------------------------------------------------------------------
 target_include_directories(
-  cudf_kafka PUBLIC "$<BUILD_INTERFACE:${CUDA_KAFKA_SOURCE_DIR}/include>"
+  cudf_kafka PUBLIC "$<BUILD_INTERFACE:${CUDF_KAFKA_SOURCE_DIR}/include>"
                     "$<INSTALL_INTERFACE:include>"
 )
 
@@ -85,6 +85,8 @@ set_target_properties(
              CXX_STANDARD_REQUIRED ON
 )
 
+add_library(cudf_kafka::cudf_kafka ALIAS cudf_kafka)
+
 # ##################################################################################################
 # * cudf_kafka Install ----------------------------------------------------------------------------
 rapids_cmake_install_lib_dir(lib_dir)
@@ -94,7 +96,7 @@ install(
   EXPORT cudf_kafka-exports
 )
 
-install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include DESTINATION include)
+install(DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/include/ DESTINATION ${CMAKE_INSTALL_INCLUDEDIR})
 
 rapids_export(
   INSTALL cudf_kafka
diff --git a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake
index aa4c5b60e7a..20aa9873f43 100644
--- a/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake
+++ b/cpp/libcudf_kafka/cmake/thirdparty/get_cudf.cmake
@@ -1,5 +1,5 @@
 # =============================================================================
-# Copyright (c) 2021-2022, NVIDIA CORPORATION.
+# Copyright (c) 2021-2023, NVIDIA CORPORATION.
 #
 # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
 # in compliance with the License. You may obtain a copy of the License at
@@ -35,21 +35,21 @@ function(find_and_configure_cudf VERSION)
   endif()
 endfunction()
 
-set(CUDA_KAFKA_MIN_VERSION_cudf
-    "${CUDA_KAFKA_VERSION_MAJOR}.${CUDA_KAFKA_VERSION_MINOR}.${CUDA_KAFKA_VERSION_PATCH}"
+set(CUDF_KAFKA_MIN_VERSION
+    "${CUDF_KAFKA_VERSION_MAJOR}.${CUDF_KAFKA_VERSION_MINOR}.${CUDF_KAFKA_VERSION_PATCH}"
 )
-find_and_configure_cudf(${CUDA_KAFKA_MIN_VERSION_cudf})
+find_and_configure_cudf(${CUDF_KAFKA_MIN_VERSION})
 
 if(cudf_REQUIRES_CUDA)
-  rapids_cuda_init_architectures(CUDA_KAFKA)
+  rapids_cuda_init_architectures(CUDF_KAFKA)
 
   # Since we are building cudf as part of ourselves we need to enable the CUDA language in the
   # top-most scope
   enable_language(CUDA)
 
-  # Since CUDA_KAFKA only enables CUDA optionally we need to manually include the file that
+  # Since CUDF_KAFKA only enables CUDA optionally we need to manually include the file that
   # rapids_cuda_init_architectures relies on `project` calling
-  if(DEFINED CMAKE_PROJECT_CUDA_KAFKA_INCLUDE)
-    include("${CMAKE_PROJECT_CUDA_KAFKA_INCLUDE}")
+  if(DEFINED CMAKE_PROJECT_CUDF_KAFKA_INCLUDE)
+    include("${CMAKE_PROJECT_CUDF_KAFKA_INCLUDE}")
   endif()
 endif()
diff --git a/cpp/libcudf_kafka/tests/CMakeLists.txt b/cpp/libcudf_kafka/tests/CMakeLists.txt
index 68a5327b455..b819cb6fc3b 100644
--- a/cpp/libcudf_kafka/tests/CMakeLists.txt
+++ b/cpp/libcudf_kafka/tests/CMakeLists.txt
@@ -26,7 +26,7 @@ function(ConfigureTest test_name)
   add_executable(${test_name} ${ARGN})
   set_target_properties(
     ${test_name}
-    PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUDA_KAFKA_BINARY_DIR}/gtests>"
+    PROPERTIES RUNTIME_OUTPUT_DIRECTORY "$<BUILD_INTERFACE:${CUDF_KAFKA_BINARY_DIR}/gtests>"
                INSTALL_RPATH "\$ORIGIN/../../../lib"
   )
   target_link_libraries(
diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu
index 8c586306ad5..6e9c634804c 100644
--- a/cpp/src/io/csv/writer_impl.cu
+++ b/cpp/src/io/csv/writer_impl.cu
@@ -146,6 +146,12 @@ struct column_to_strings_fn {
   {
   }
 
+  ~column_to_strings_fn()                                      = default;
+  column_to_strings_fn(column_to_strings_fn const&)            = delete;
+  column_to_strings_fn& operator=(column_to_strings_fn const&) = delete;
+  column_to_strings_fn(column_to_strings_fn&&)                 = delete;
+  column_to_strings_fn& operator=(column_to_strings_fn&&)      = delete;
+
   // Note: `null` replacement with `na_rep` deferred to `concatenate()`
   // instead of column-wise; might be faster
   //
@@ -160,8 +166,9 @@ struct column_to_strings_fn {
   std::enable_if_t<std::is_same_v<column_type, bool>, std::unique_ptr<column>> operator()(
     column_view const& column) const
   {
-    return cudf::strings::detail::from_booleans(
-      column, options_.get_true_value(), options_.get_false_value(), stream_, mr_);
+    string_scalar true_string{options_.get_true_value(), true, stream_};
+    string_scalar false_string{options_.get_false_value(), true, stream_};
+    return cudf::strings::detail::from_booleans(column, true_string, false_string, stream_, mr_);
   }
 
   // strings:
@@ -367,10 +374,10 @@ void write_chunked(data_sink* out_sink,
 
   CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column.");
 
-  cudf::string_scalar newline{options.get_line_terminator()};
+  cudf::string_scalar newline{options.get_line_terminator(), true, stream};
   auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view,
                                                             newline,
-                                                            string_scalar("", false),
+                                                            string_scalar{"", false, stream},
                                                             stream,
                                                             rmm::mr::get_current_device_resource());
   strings_column_view strings_column{p_str_col_w_nl->view()};
@@ -455,12 +462,14 @@ void write_csv(data_sink* out_sink,
 
       // populate vector of string-converted columns:
       //
-      std::transform(sub_view.begin(),
-                     sub_view.end(),
-                     std::back_inserter(str_column_vec),
-                     [converter](auto const& current_col) {
-                       return cudf::type_dispatcher(current_col.type(), converter, current_col);
-                     });
+      std::transform(
+        sub_view.begin(),
+        sub_view.end(),
+        std::back_inserter(str_column_vec),
+        [&converter = std::as_const(converter)](auto const& current_col) {
+          return cudf::type_dispatcher<cudf::id_to_type_impl, column_to_strings_fn const&>(
+            current_col.type(), converter, current_col);
+        });
 
       // create string table view from str_column_vec:
       //
@@ -470,18 +479,19 @@ void write_csv(data_sink* out_sink,
       // concatenate columns in each row into one big string column
       // (using null representation and delimiter):
       //
-      std::string delimiter_str{options.get_inter_column_delimiter()};
       auto str_concat_col = [&] {
+        cudf::string_scalar delimiter_str{
+          std::string{options.get_inter_column_delimiter()}, true, stream};
+        cudf::string_scalar options_narep{options.get_na_rep(), true, stream};
         if (str_table_view.num_columns() > 1)
           return cudf::strings::detail::concatenate(str_table_view,
                                                     delimiter_str,
-                                                    options.get_na_rep(),
+                                                    options_narep,
                                                     strings::separator_on_nulls::YES,
                                                     stream,
                                                     rmm::mr::get_current_device_resource());
-        cudf::string_scalar narep{options.get_na_rep()};
         return cudf::strings::detail::replace_nulls(
-          str_table_view.column(0), narep, stream, rmm::mr::get_current_device_resource());
+          str_table_view.column(0), options_narep, stream, rmm::mr::get_current_device_resource());
       }();
 
       write_chunked(out_sink, str_concat_col->view(), options, stream, mr);
diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp
index 00d56008611..964e40e36cd 100644
--- a/cpp/src/io/functions.cpp
+++ b/cpp/src/io/functions.cpp
@@ -230,7 +230,9 @@ void write_json(json_writer_options const& options,
     mr);
 }
 
-table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr)
+table_with_metadata read_csv(csv_reader_options options,
+                             rmm::cuda_stream_view stream,
+                             rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
 
@@ -245,12 +247,14 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_
   return cudf::io::detail::csv::read_csv(  //
     std::move(datasources[0]),
     options,
-    cudf::get_default_stream(),
+    stream,
     mr);
 }
 
 // Freeform API wraps the detail writer class API
-void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resource* mr)
+void write_csv(csv_writer_options const& options,
+               rmm::cuda_stream_view stream,
+               rmm::mr::device_memory_resource* mr)
 {
   using namespace cudf::io::detail;
 
@@ -262,7 +266,7 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc
     options.get_table(),
     options.get_names(),
     options,
-    cudf::get_default_stream(),
+    stream,
     mr);
 }
 
diff --git a/cpp/src/io/orc/orc.cpp b/cpp/src/io/orc/orc.cpp
index bc399b75ef9..ee5fa4e8b5a 100644
--- a/cpp/src/io/orc/orc.cpp
+++ b/cpp/src/io/orc/orc.cpp
@@ -182,6 +182,19 @@ void ProtobufReader::read(timestamp_statistics& s, size_t maxlen)
                        field_reader(5, s.minimum_nanos),
                        field_reader(6, s.maximum_nanos));
   function_builder(s, maxlen, op);
+
+  // Adjust nanoseconds because they are encoded as (value + 1)
+  // Range [1, 1000'000] is translated here to [0, 999'999]
+  if (s.minimum_nanos.has_value()) {
+    auto& min_nanos = s.minimum_nanos.value();
+    CUDF_EXPECTS(min_nanos >= 1 and min_nanos <= 1000'000, "Invalid minimum nanoseconds");
+    --min_nanos;
+  }
+  if (s.maximum_nanos.has_value()) {
+    auto& max_nanos = s.maximum_nanos.value();
+    CUDF_EXPECTS(max_nanos >= 1 and max_nanos <= 1000'000, "Invalid maximum nanoseconds");
+    --max_nanos;
+  }
 }
 
 void ProtobufReader::read(column_statistics& s, size_t maxlen)
diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp
index 6f65e384d2d..783ed4206b6 100644
--- a/cpp/src/io/orc/orc.hpp
+++ b/cpp/src/io/orc/orc.hpp
@@ -41,6 +41,12 @@ static constexpr uint32_t block_header_size = 3;
 // Seconds from January 1st, 1970 to January 1st, 2015
 static constexpr int64_t orc_utc_epoch = 1420070400;
 
+// Used for the nanosecond remainder in timestamp statistics when the actual nanoseconds of min/max
+// are not included. As the timestamp statistics are stored as milliseconds + nanosecond remainder,
+// the maximum nanosecond remainder is 999,999 (nanoseconds in a millisecond - 1).
+static constexpr int32_t DEFAULT_MIN_NANOS = 0;
+static constexpr int32_t DEFAULT_MAX_NANOS = 999'999;
+
 struct PostScript {
   uint64_t footerLength       = 0;     // the length of the footer section in bytes
   CompressionKind compression = NONE;  // the kind of generic compression used
diff --git a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu
index 479a2dfada3..429fd5b929d 100644
--- a/cpp/src/io/orc/stats_enc.cu
+++ b/cpp/src/io/orc/stats_enc.cu
@@ -27,6 +27,10 @@ namespace cudf::io::orc::gpu {
 
 using strings::detail::fixed_point_string_size;
 
+// Nanosecond statistics should not be enabled until the spec version is set correctly in the output
+// files. See https://github.com/rapidsai/cudf/issues/14325 for more details
+constexpr bool enable_nanosecond_statistics = false;
+
 constexpr unsigned int init_threads_per_group = 32;
 constexpr unsigned int init_groups_per_block  = 4;
 constexpr unsigned int init_threads_per_block = init_threads_per_group * init_groups_per_block;
@@ -96,8 +100,10 @@ __global__ void __launch_bounds__(block_size, 1)
           stats_len = pb_fldlen_common + pb_fld_hdrlen + 2 * (pb_fld_hdrlen + pb_fldlen_int64);
           break;
         case dtype_timestamp64:
-          stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64) +
-                      2 * (pb_fld_hdrlen + pb_fldlen_int32);
+          stats_len = pb_fldlen_common + pb_fld_hdrlen + 4 * (pb_fld_hdrlen + pb_fldlen_int64);
+          if constexpr (enable_nanosecond_statistics) {
+            stats_len += 2 * (pb_fld_hdrlen + pb_fldlen_int32);
+          }
           break;
         case dtype_float32:
         case dtype_float64:
@@ -405,7 +411,8 @@ __global__ void __launch_bounds__(encode_threads_per_block)
         //  optional sint64 minimumUtc = 3; // min,max values saved as milliseconds since UNIX epoch
         //  optional sint64 maximumUtc = 4;
         //  optional int32 minimumNanos = 5; // lower 6 TS digits for min/max to achieve nanosecond
-        //  precision optional int32 maximumNanos = 6;
+        //  precision
+        //  optional int32 maximumNanos = 6;
         // }
         if (s->chunk.has_minmax) {
           cur[0] = 9 * 8 + ProtofType::FIXEDLEN;
@@ -416,12 +423,22 @@ __global__ void __launch_bounds__(encode_threads_per_block)
             split_nanosecond_timestamp(s->chunk.max_value.i_val);
 
           // minimum/maximum are the same as minimumUtc/maximumUtc as we always write files in UTC
-          cur          = pb_put_int(cur, 1, min_ms);            // minimum
-          cur          = pb_put_int(cur, 2, max_ms);            // maximum
-          cur          = pb_put_int(cur, 3, min_ms);            // minimumUtc
-          cur          = pb_put_int(cur, 4, max_ms);            // maximumUtc
-          cur          = pb_put_int(cur, 5, min_ns_remainder);  // minimumNanos
-          cur          = pb_put_int(cur, 6, max_ns_remainder);  // maximumNanos
+          cur = pb_put_int(cur, 1, min_ms);  // minimum
+          cur = pb_put_int(cur, 2, max_ms);  // maximum
+          cur = pb_put_int(cur, 3, min_ms);  // minimumUtc
+          cur = pb_put_int(cur, 4, max_ms);  // maximumUtc
+
+          if constexpr (enable_nanosecond_statistics) {
+            if (min_ns_remainder != DEFAULT_MIN_NANOS) {
+              // using uint because positive values are not zigzag encoded
+              cur = pb_put_uint(cur, 5, min_ns_remainder + 1);  // minimumNanos
+            }
+            if (max_ns_remainder != DEFAULT_MAX_NANOS) {
+              // using uint because positive values are not zigzag encoded
+              cur = pb_put_uint(cur, 6, max_ns_remainder + 1);  // maximumNanos
+            }
+          }
+
           fld_start[1] = cur - (fld_start + 2);
         }
         break;
diff --git a/cpp/src/text/vocabulary_tokenize.cu b/cpp/src/text/vocabulary_tokenize.cu
index 41f8c0a8731..511f1995374 100644
--- a/cpp/src/text/vocabulary_tokenize.cu
+++ b/cpp/src/text/vocabulary_tokenize.cu
@@ -276,8 +276,12 @@ __global__ void token_counts_fn(cudf::column_device_view const d_strings,
   __syncwarp();
 
   for (auto itr = d_output + lane_idx + 1; itr < d_output_end; itr += cudf::detail::warp_size) {
-    // add one if at the edge of a token or at the string's end
-    count += ((*itr && !(*(itr - 1))) || (itr + 1 == d_output_end));
+    // add one if at the edge of a token or if at the string's end
+    if (*itr) {
+      count += !(*(itr - 1));
+    } else {
+      count += (itr + 1 == d_output_end);
+    }
   }
   __syncwarp();
 
diff --git a/cpp/tests/CMakeLists.txt b/cpp/tests/CMakeLists.txt
index b0382d15807..1be8566fb0f 100644
--- a/cpp/tests/CMakeLists.txt
+++ b/cpp/tests/CMakeLists.txt
@@ -393,6 +393,7 @@ set_tests_properties(
 ConfigureTest(
   ITERATOR_TEST
   iterator/indexalator_test.cu
+  iterator/offsetalator_test.cu
   iterator/optional_iterator_test_chrono.cu
   iterator/optional_iterator_test_numeric.cu
   iterator/pair_iterator_test_chrono.cu
@@ -634,6 +635,7 @@ ConfigureTest(
 ConfigureTest(STREAM_BINARYOP_TEST streams/binaryop_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_CONCATENATE_TEST streams/concatenate_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_COPYING_TEST streams/copying_test.cpp STREAM_MODE testing)
+ConfigureTest(STREAM_CSVIO_TEST streams/io/csv_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_DICTIONARY_TEST streams/dictionary_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_FILLING_TEST streams/filling_test.cpp STREAM_MODE testing)
 ConfigureTest(STREAM_GROUPBY_TEST streams/groupby_test.cpp STREAM_MODE testing)
diff --git a/cpp/tests/io/orc_test.cpp b/cpp/tests/io/orc_test.cpp
index 234716749ff..dca3886db14 100644
--- a/cpp/tests/io/orc_test.cpp
+++ b/cpp/tests/io/orc_test.cpp
@@ -1054,8 +1054,12 @@ TEST_F(OrcStatisticsTest, Basic)
     EXPECT_EQ(*ts4.maximum, 3);
     EXPECT_EQ(*ts4.minimum_utc, -4);
     EXPECT_EQ(*ts4.maximum_utc, 3);
-    EXPECT_EQ(*ts4.minimum_nanos, 999994);
-    EXPECT_EQ(*ts4.maximum_nanos, 6);
+    // nanosecond precision can't be included until we write a writer version that includes ORC-135
+    // see https://github.com/rapidsai/cudf/issues/14325
+    // EXPECT_EQ(*ts4.minimum_nanos, 999994);
+    EXPECT_FALSE(ts4.minimum_nanos.has_value());
+    // EXPECT_EQ(*ts4.maximum_nanos, 6);
+    EXPECT_FALSE(ts4.maximum_nanos.has_value());
 
     auto& s5 = stats[5];
     EXPECT_EQ(*s5.number_of_values, 4ul);
@@ -1065,8 +1069,12 @@ TEST_F(OrcStatisticsTest, Basic)
     EXPECT_EQ(*ts5.maximum, 3000);
     EXPECT_EQ(*ts5.minimum_utc, -3001);
     EXPECT_EQ(*ts5.maximum_utc, 3000);
-    EXPECT_EQ(*ts5.minimum_nanos, 994000);
-    EXPECT_EQ(*ts5.maximum_nanos, 6000);
+    // nanosecond precision can't be included until we write a writer version that includes ORC-135
+    // see https://github.com/rapidsai/cudf/issues/14325
+    // EXPECT_EQ(*ts5.minimum_nanos, 994000);
+    EXPECT_FALSE(ts5.minimum_nanos.has_value());
+    // EXPECT_EQ(*ts5.maximum_nanos, 6000);
+    EXPECT_FALSE(ts5.maximum_nanos.has_value());
 
     auto& s6 = stats[6];
     EXPECT_EQ(*s6.number_of_values, 4ul);
diff --git a/cpp/tests/iterator/indexalator_test.cu b/cpp/tests/iterator/indexalator_test.cu
index 3e8bcd5cb0d..0c10853ec02 100644
--- a/cpp/tests/iterator/indexalator_test.cu
+++ b/cpp/tests/iterator/indexalator_test.cu
@@ -157,40 +157,3 @@ TYPED_TEST(IndexalatorTest, output_iterator)
   expected = cudf::test::fixed_width_column_wrapper<cudf::size_type>({0, 1, 1, 2, 3, 4, 5, 5, 7});
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
 }
-
-/**
- * For testing creating and using the indexalator in device code.
- */
-struct device_functor_fn {
-  cudf::column_device_view const d_col;
-  __device__ cudf::size_type operator()(cudf::size_type idx)
-  {
-    auto itr = cudf::detail::input_indexalator(d_col.head(), d_col.type());
-    return itr[idx] * 3;
-  }
-};
-
-TYPED_TEST(IndexalatorTest, device_indexalator)
-{
-  using T = TypeParam;
-
-  auto d_col1 =
-    cudf::test::fixed_width_column_wrapper<T, int32_t>({0, 6, 7, 14, 23, 33, 43, 45, 63});
-  auto d_col2 =
-    cudf::test::fixed_width_column_wrapper<cudf::size_type>({0, 0, 0, 0, 0, 0, 0, 0, 0});
-  auto input  = cudf::column_view(d_col1);
-  auto output = cudf::mutable_column_view(d_col2);
-  auto stream = cudf::get_default_stream();
-
-  auto d_input = cudf::column_device_view::create(input, stream);
-
-  thrust::transform(rmm::exec_policy(stream),
-                    thrust::counting_iterator<int>(0),
-                    thrust::counting_iterator<int>(input.size()),
-                    output.begin<cudf::size_type>(),
-                    device_functor_fn{*d_input});
-
-  auto expected =
-    cudf::test::fixed_width_column_wrapper<cudf::size_type>({0, 18, 21, 42, 69, 99, 129, 135, 189});
-  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
-}
diff --git a/cpp/tests/iterator/offsetalator_test.cu b/cpp/tests/iterator/offsetalator_test.cu
new file mode 100644
index 00000000000..e569e58f42a
--- /dev/null
+++ b/cpp/tests/iterator/offsetalator_test.cu
@@ -0,0 +1,140 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software distributed under the License
+ * is distributed on an "AS IS" BASIS,  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+ * or implied. See the License for the specific language governing permissions and limitations under
+ * the License.
+ */
+
+#include <tests/iterator/iterator_tests.cuh>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/type_lists.hpp>
+
+#include <cudf/detail/offsets_iterator_factory.cuh>
+
+#include <thrust/binary_search.h>
+#include <thrust/gather.h>
+#include <thrust/host_vector.h>
+#include <thrust/optional.h>
+#include <thrust/pair.h>
+#include <thrust/scatter.h>
+#include <thrust/sequence.h>
+#include <thrust/transform.h>
+
+using TestingTypes = cudf::test::Types<int32_t, int64_t>;  // TypeParam values for the suite below
+
+template <typename T>
+struct OffsetalatorTest : public IteratorTest<T> {};  // base supplies iterator_test_thrust()
+
+TYPED_TEST_SUITE(OffsetalatorTest, TestingTypes);
+
+TYPED_TEST(OffsetalatorTest, input_iterator)
+{
+  // The input offsetalator is checked against the column's values converted to cudf::size_type.
+  using T = TypeParam;
+
+  auto h_data = cudf::test::make_type_param_vector<T>({0, 6, 0, -14, 13, 64, -13, -20, 45});
+  auto column = cudf::test::fixed_width_column_wrapper<T>(h_data.begin(), h_data.end());
+
+  auto expected = thrust::host_vector<cudf::size_type>(h_data.size());
+  for (std::size_t idx = 0; idx < h_data.size(); ++idx) {
+    expected[idx] = static_cast<cudf::size_type>(h_data[idx]);
+  }
+
+  auto itr = cudf::detail::offsetalator_factory::make_input_iterator(column);
+  this->iterator_test_thrust(expected, itr, h_data.size());
+}
+
+TYPED_TEST(OffsetalatorTest, output_iterator)
+{
+  using T = TypeParam;
+  // Drives several thrust algorithms that write into d_col2 (element type T) through `itr`.
+  auto d_col1 = cudf::test::fixed_width_column_wrapper<int64_t>({0, 6, 7, 14, 23, 33, 43, 45, 63});
+  auto d_col2 = cudf::test::fixed_width_column_wrapper<T>({0, 0, 0, 0, 0, 0, 0, 0, 0});
+  auto itr    = cudf::detail::offsetalator_factory::make_output_iterator(d_col2);
+  auto input  = cudf::column_view(d_col1);
+  auto stream = cudf::get_default_stream();
+  // gather: element i of the output receives input[map[i]]
+  auto map   = cudf::test::fixed_width_column_wrapper<int>({0, 2, 4, 6, 8, 1, 3, 5, 7});
+  auto d_map = cudf::column_view(map);
+  thrust::gather(rmm::exec_policy_nosync(stream),
+                 d_map.begin<int>(),
+                 d_map.end<int>(),
+                 input.begin<int64_t>(),
+                 itr);
+  auto expected = cudf::test::fixed_width_column_wrapper<T>({0, 7, 23, 43, 63, 6, 14, 33, 45});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+  // scatter: input[i] is written to output position map[i]
+  thrust::scatter(rmm::exec_policy_nosync(stream),
+                  input.begin<int64_t>(),
+                  input.end<int64_t>(),
+                  d_map.begin<int>(),
+                  itr);
+  expected = cudf::test::fixed_width_column_wrapper<T>({0, 33, 6, 43, 7, 45, 14, 63, 23});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+  // fill: every output element is set to 77
+  thrust::fill(rmm::exec_policy(stream), itr, itr + input.size(), 77);
+  expected = cudf::test::fixed_width_column_wrapper<T>({77, 77, 77, 77, 77, 77, 77, 77, 77});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+  // sequence: output becomes 0, 1, 2, ...
+  thrust::sequence(rmm::exec_policy(stream), itr, itr + input.size());
+  expected = cudf::test::fixed_width_column_wrapper<T>({0, 1, 2, 3, 4, 5, 6, 7, 8});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+  // lower_bound: for each input value, the index of the first offset not less than it
+  auto offsets =
+    cudf::test::fixed_width_column_wrapper<int64_t>({0, 10, 20, 30, 40, 50, 60, 70, 80});
+  auto d_offsets = cudf::column_view(offsets);
+  thrust::lower_bound(rmm::exec_policy(stream),
+                      d_offsets.begin<int64_t>(),
+                      d_offsets.end<int64_t>(),
+                      input.begin<int64_t>(),
+                      input.end<int64_t>(),
+                      itr);
+  expected = cudf::test::fixed_width_column_wrapper<T>({0, 1, 1, 2, 3, 4, 5, 5, 7});
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(d_col2, expected);
+}
+
+namespace {
+/**
+ * Builds an input_offsetalator in device code and returns three times the value at `idx`.
+ */
+struct device_functor_fn {
+  cudf::column_device_view const d_col;
+  __device__ int32_t operator()(int idx)
+  {
+    auto const itr = cudf::detail::input_offsetalator(d_col.head(), d_col.type());
+    return static_cast<int32_t>(itr[idx] * 3);  // cast the product to the int32_t return type
+  }
+};
+}  // namespace
+
+TYPED_TEST(OffsetalatorTest, device_offsetalator)
+{
+  // Runs device_functor_fn over every row of a column of type T and checks the tripled values.
+  using T = TypeParam;
+
+  auto src_col = cudf::test::fixed_width_column_wrapper<T>({0, 6, 7, 14, 23, 33, 43, 45, 63});
+  auto dst_col = cudf::test::fixed_width_column_wrapper<int32_t>({0, 0, 0, 0, 0, 0, 0, 0, 0});
+  auto expected =
+    cudf::test::fixed_width_column_wrapper<int32_t>({0, 18, 21, 42, 69, 99, 129, 135, 189});
+
+  auto stream = cudf::get_default_stream();
+  auto src    = cudf::column_view(src_col);
+  auto dst    = cudf::mutable_column_view(dst_col);
+  auto d_src  = cudf::column_device_view::create(src, stream);
+  auto first  = thrust::counting_iterator<int>(0);
+  auto last   = thrust::counting_iterator<int>(src.size());
+
+  thrust::transform(
+    rmm::exec_policy(stream), first, last, dst.begin<int32_t>(), device_functor_fn{*d_src});
+
+  CUDF_TEST_EXPECT_COLUMNS_EQUAL(dst_col, expected);
+}
diff --git a/cpp/tests/streams/io/csv_test.cpp b/cpp/tests/streams/io/csv_test.cpp
new file mode 100644
index 00000000000..88514fa412c
--- /dev/null
+++ b/cpp/tests/streams/io/csv_test.cpp
@@ -0,0 +1,102 @@
+/*
+ * Copyright (c) 2023, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/io/csv.hpp>
+#include <cudf/io/detail/csv.hpp>
+#include <cudf/table/table.hpp>
+#include <cudf/table/table_view.hpp>
+#include <cudf/types.hpp>
+
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/column_wrapper.hpp>
+#include <cudf_test/default_stream.hpp>
+#include <cudf_test/iterator_utilities.hpp>
+
+#include <string>
+#include <vector>
+
+auto const temp_env = static_cast<cudf::test::TempDirTestEnvironment*>(
+  ::testing::AddGlobalTestEnvironment(new cudf::test::TempDirTestEnvironment));  // owned by gtest
+
+class CSVTest : public cudf::test::BaseFixture {};  // fixture shared by the CSV tests below
+
+TEST_F(CSVTest, CSVWriter)
+{
+  // Builds a nine-column table (bool, integer, floating-point, decimal128 and string columns)
+  // and writes it to a CSV file in the temp directory, passing the test stream to write_csv.
+  constexpr auto num_rows = 10;
+  std::vector<size_t> zeros(num_rows, 0);
+  std::vector<size_t> ones(num_rows, 1);
+  std::vector<std::string> words(num_rows, "rapids");
+
+  auto pos_it = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return numeric::decimal128{ones[i], numeric::scale_type{12}};
+  });
+  auto neg_it = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return numeric::decimal128{ones[i], numeric::scale_type{-12}};
+  });
+
+  cudf::test::fixed_width_column_wrapper<bool> bools(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<int8_t> i8s(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<int16_t> i16s(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<int32_t> i32s(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<float> f32s(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<double> f64s(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<numeric::decimal128> dec_pos(pos_it, pos_it + num_rows);
+  cudf::test::fixed_width_column_wrapper<numeric::decimal128> dec_neg(neg_it, neg_it + num_rows);
+  cudf::test::strings_column_wrapper strs(words.begin(), words.end());
+  cudf::table_view tab({bools, i8s, i16s, i32s, f32s, f64s, dec_pos, dec_neg, strs});
+
+  auto const filepath = temp_env->get_temp_dir() + "multicolumn.csv";
+  auto w_options      = cudf::io::csv_writer_options::builder(cudf::io::sink_info{filepath}, tab)
+                     .include_header(false)
+                     .inter_column_delimiter(',');
+  cudf::io::write_csv(w_options.build(), cudf::test::get_default_stream());
+}
+
+TEST_F(CSVTest, CSVReader)
+{
+  constexpr auto num_rows = 10;
+  std::vector<size_t> zeros(num_rows, 0);
+  std::vector<size_t> ones(num_rows, 1);
+  auto col6_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return numeric::decimal128{ones[i], numeric::scale_type{12}};
+  });
+  auto col7_data = cudf::detail::make_counting_transform_iterator(0, [&](auto i) {
+    return numeric::decimal128{ones[i], numeric::scale_type{-12}};
+  });
+
+  cudf::test::fixed_width_column_wrapper<bool> col0(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<int8_t> col1(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<int16_t> col2(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<int32_t> col3(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<float> col4(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<double> col5(zeros.begin(), zeros.end());
+  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col6(col6_data, col6_data + num_rows);
+  cudf::test::fixed_width_column_wrapper<numeric::decimal128> col7(col7_data, col7_data + num_rows);
+  std::vector<std::string> col8_data(num_rows, "rapids");
+  cudf::test::strings_column_wrapper col8(col8_data.begin(), col8_data.end());
+  cudf::table_view tab({col0, col1, col2, col3, col4, col5, col6, col7, col8});
+  // Write the fixture without a header row, then read it back (header(-1)) on the test stream.
+  auto const filepath = temp_env->get_temp_dir() + "multicolumn.csv";
+  auto w_options      = cudf::io::csv_writer_options::builder(cudf::io::sink_info{filepath}, tab)
+                     .include_header(false)
+                     .inter_column_delimiter(',');
+  cudf::io::write_csv(w_options.build(), cudf::test::get_default_stream());
+  auto const r_options =
+    cudf::io::csv_reader_options::builder(cudf::io::source_info{filepath}).header(-1).build();
+  cudf::io::read_csv(r_options, cudf::test::get_default_stream());
+}
diff --git a/cpp/tests/text/tokenize_tests.cpp b/cpp/tests/text/tokenize_tests.cpp
index 8118183a458..ea36e13de6f 100644
--- a/cpp/tests/text/tokenize_tests.cpp
+++ b/cpp/tests/text/tokenize_tests.cpp
@@ -246,14 +246,14 @@ TEST_F(TextTokenizeTest, Vocabulary)
 
 TEST_F(TextTokenizeTest, VocabularyLongStrings)
 {
-  cudf::test::strings_column_wrapper vocabulary(  // leaving out 'cat' on purpose
+  cudf::test::strings_column_wrapper vocabulary(
     {"ate", "chased", "cheese", "dog", "fox", "jumped", "mouse", "mousé", "over", "the"});
   auto vocab = nvtext::load_vocabulary(cudf::strings_column_view(vocabulary));
 
   std::vector<std::string> h_strings(
     4,
     "the fox jumped chased the dog cheese mouse at the over there dog mouse cat plus the horse "
-    "jumped over the mouse house with the dog");
+    "jumped  over  the mousé  house with the dog  ");
   cudf::test::strings_column_wrapper input(h_strings.begin(), h_strings.end());
   auto input_view = cudf::strings_column_view(input);
   auto delimiter  = cudf::string_scalar(" ");
@@ -262,10 +262,10 @@ TEST_F(TextTokenizeTest, VocabularyLongStrings)
 
   using LCW = cudf::test::lists_column_wrapper<cudf::size_type>;
   // clang-format off
-  LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
-                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
-                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3},
-                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 6, -1, -1, 9, 3}});
+  LCW expected({LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
+                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
+                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3},
+                LCW{ 9, 4, 5, 1, 9, 3, 2, 6, -1, 9, 8, -1, 3, 6, -1, -1, 9, -1, 5, 8, 9, 7, -1, -1, 9, 3}});
   // clang-format on
   CUDF_TEST_EXPECT_COLUMNS_EQUAL(*results, expected);
 
diff --git a/dependencies.yaml b/dependencies.yaml
index 9b350be3af2..f00273a5db1 100644
--- a/dependencies.yaml
+++ b/dependencies.yaml
@@ -9,8 +9,8 @@ files:
       - build_all
       - build_cpp
       - build_wheels
-      - build_python
       - build_python_common
+      - build_python_cudf
       - cudatoolkit
       - develop
       - docs
@@ -71,8 +71,8 @@ files:
       table: build-system
     includes:
       - build_all
-      - build_python
       - build_python_common
+      - build_python_cudf
       - build_wheels
   py_run_cudf:
     output: pyproject
@@ -138,8 +138,8 @@ files:
     extras:
       table: build-system
     includes:
-      - build_wheels
       - build_python_common
+      - build_wheels
   py_run_cudf_kafka:
     output: pyproject
     pyproject_dir: python/cudf_kafka
@@ -244,7 +244,7 @@ dependencies:
           - libarrow-all==14.0.1.*
           - librdkafka>=1.9.0,<1.10.0a0
           # Align nvcomp version with rapids-cmake
-          - nvcomp==2.6.1
+          - nvcomp==3.0.4
           - spdlog>=1.11.0,<1.12
   build_wheels:
     common:
@@ -256,19 +256,19 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - cython>=3.0.0
+          - cython>=3.0.3
           # TODO: Pin to numpy<1.25 until cudf requires pandas 2
           - &numpy numpy>=1.21,<1.25
+          - scikit-build>=0.13.1
       - output_types: [conda, requirements, pyproject]
         packages:
           # Hard pin the patch version used during the build. This must be kept
           # in sync with the version pinned in get_arrow.cmake.
           - pyarrow==14.0.1.*
-  build_python:
+  build_python_cudf:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - scikit-build>=0.13.1
           - rmm==24.2.*
       - output_types: conda
         packages:
@@ -302,9 +302,6 @@ dependencies:
               - cuda-nvrtc-dev
               - cuda-nvtx-dev
               - libcurand-dev
-              # TODO: Remove after https://github.com/rapidsai/cudf/pull/14292 updates
-              # cudf_kafka to use scikit-build
-              - cuda-gdb
           - matrix:
               cuda: "11.8"
             packages:
@@ -500,12 +497,10 @@ dependencies:
     common:
       - output_types: [conda, requirements, pyproject]
         packages:
-          - dask>=2023.9.2
-          - distributed>=2023.9.2
+          - rapids-dask-dependency==24.2.*
       - output_types: conda
         packages:
           - cupy>=12.0.0
-          - dask-core>=2023.9.2  # dask-core in conda is the actual package & dask is the meta package
       - output_types: pyproject
         packages:
           - &cudf cudf==24.2.*
diff --git a/python/cudf/cudf/_lib/CMakeLists.txt b/python/cudf/cudf/_lib/CMakeLists.txt
index 1b543b94589..c041c7f4842 100644
--- a/python/cudf/cudf/_lib/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/CMakeLists.txt
@@ -81,12 +81,6 @@ target_link_libraries(strings_udf cudf_strings_udf)
 # necessary. The relevant command is tar -xf /opt/_internal/static-libs-for-embedding-only.tar.xz -C
 # /opt/_internal"
 find_package(NumPy REQUIRED)
-set(targets_using_numpy interop avro csv orc json parquet)
-foreach(target IN LISTS targets_using_numpy)
-  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
-  # Switch to the line below when we switch back to FindPython.cmake in CMake 3.24.
-  # target_include_directories(${target} PRIVATE "${Python_NumPy_INCLUDE_DIRS}")
-endforeach()
 
 set(targets_using_dlpack interop)
 foreach(target IN LISTS targets_using_dlpack)
diff --git a/python/cudf/cudf/_lib/column.pyx b/python/cudf/cudf/_lib/column.pyx
index f751d73b142..0edf9f8aa95 100644
--- a/python/cudf/cudf/_lib/column.pyx
+++ b/python/cudf/cudf/_lib/column.pyx
@@ -24,7 +24,7 @@ from cudf.utils.dtypes import _get_base_dtype
 
 from cpython.buffer cimport PyObject_CheckBuffer
 from libc.stdint cimport uintptr_t
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
@@ -47,7 +47,6 @@ from cudf._lib.cpp.column.column_factories cimport (
     make_numeric_column,
 )
 from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 from cudf._lib.cpp.null_mask cimport null_count as cpp_null_count
 from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.scalar cimport DeviceScalar
diff --git a/python/cudf/cudf/_lib/concat.pyx b/python/cudf/cudf/_lib/concat.pyx
index feaf75ef237..1ec4719631e 100644
--- a/python/cudf/cudf/_lib/concat.pyx
+++ b/python/cudf/cudf/_lib/concat.pyx
@@ -1,7 +1,7 @@
 # Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 from libcpp cimport bool
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.utility cimport move
 from libcpp.vector cimport vector
 
@@ -12,7 +12,6 @@ from cudf._lib.cpp.concatenate cimport (
     concatenate_masks as libcudf_concatenate_masks,
     concatenate_tables as libcudf_concatenate_tables,
 )
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 from cudf._lib.cpp.table.table cimport table, table_view
 from cudf._lib.utils cimport (
     data_from_unique_ptr,
diff --git a/python/cudf/cudf/_lib/copying.pyx b/python/cudf/cudf/_lib/copying.pyx
index f57bc15ed57..ea6ee76c14a 100644
--- a/python/cudf/cudf/_lib/copying.pyx
+++ b/python/cudf/cudf/_lib/copying.pyx
@@ -24,12 +24,13 @@ from cudf._lib.utils cimport table_view_from_columns, table_view_from_table
 from cudf._lib.reduce import minmax
 from cudf.core.abc import Serializable
 
+from libcpp.functional cimport reference_wrapper
+from libcpp.memory cimport make_unique
+
 cimport cudf._lib.cpp.contiguous_split as cpp_contiguous_split
 cimport cudf._lib.cpp.copying as cpp_copying
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view
-from cudf._lib.cpp.libcpp.functional cimport reference_wrapper
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 from cudf._lib.cpp.lists.gather cimport (
     segmented_gather as cpp_segmented_gather,
 )
diff --git a/python/cudf/cudf/_lib/cpp/copying.pxd b/python/cudf/cudf/_lib/cpp/copying.pxd
index 20725c252fc..5637b55ac1c 100644
--- a/python/cudf/cudf/_lib/cpp/copying.pxd
+++ b/python/cudf/cudf/_lib/cpp/copying.pxd
@@ -2,6 +2,7 @@
 
 from libc.stdint cimport int32_t, int64_t, uint8_t
 from libcpp cimport bool
+from libcpp.functional cimport reference_wrapper
 from libcpp.memory cimport unique_ptr
 from libcpp.vector cimport vector
 
@@ -9,7 +10,6 @@ from rmm._lib.device_buffer cimport device_buffer
 
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view, mutable_column_view
-from cudf._lib.cpp.libcpp.functional cimport reference_wrapper
 from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.cpp.table.table_view cimport table_view
diff --git a/python/cudf/cudf/_lib/cpp/groupby.pxd b/python/cudf/cudf/_lib/cpp/groupby.pxd
index 2ecdf76842f..0266404fc50 100644
--- a/python/cudf/cudf/_lib/cpp/groupby.pxd
+++ b/python/cudf/cudf/_lib/cpp/groupby.pxd
@@ -1,6 +1,7 @@
-# Copyright (c) 2020-2021, NVIDIA CORPORATION.
+# Copyright (c) 2020-2023, NVIDIA CORPORATION.
 
 from libcpp cimport bool
+from libcpp.functional cimport reference_wrapper
 from libcpp.memory cimport unique_ptr
 from libcpp.pair cimport pair
 from libcpp.vector cimport vector
@@ -11,7 +12,6 @@ from cudf._lib.cpp.aggregation cimport (
 )
 from cudf._lib.cpp.column.column cimport column
 from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.libcpp.functional cimport reference_wrapper
 from cudf._lib.cpp.replace cimport replace_policy
 from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.table.table cimport table
diff --git a/python/cudf/cudf/_lib/cpp/io/orc.pxd b/python/cudf/cudf/_lib/cpp/io/orc.pxd
index dd6f919a74d..d5ac8574fe4 100644
--- a/python/cudf/cudf/_lib/cpp/io/orc.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/orc.pxd
@@ -4,12 +4,12 @@ from libc.stdint cimport uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr
+from libcpp.optional cimport optional
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
 cimport cudf._lib.cpp.io.types as cudf_io_types
 cimport cudf._lib.cpp.table.table_view as cudf_table_view
-from cudf._lib.cpp.libcpp.optional cimport optional
 from cudf._lib.cpp.types cimport data_type, size_type
 
 
diff --git a/python/cudf/cudf/_lib/cpp/io/parquet.pxd b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
index a6a7ba034aa..cdd1bde0274 100644
--- a/python/cudf/cudf/_lib/cpp/io/parquet.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/parquet.pxd
@@ -2,16 +2,16 @@
 
 from libc.stdint cimport uint8_t
 from libcpp cimport bool
+from libcpp.functional cimport reference_wrapper
 from libcpp.map cimport map
 from libcpp.memory cimport shared_ptr, unique_ptr
+from libcpp.optional cimport optional
 from libcpp.string cimport string
 from libcpp.vector cimport vector
 
 cimport cudf._lib.cpp.io.types as cudf_io_types
 cimport cudf._lib.cpp.table.table_view as cudf_table_view
 from cudf._lib.cpp.expressions cimport expression
-from cudf._lib.cpp.libcpp.functional cimport reference_wrapper
-from cudf._lib.cpp.libcpp.optional cimport optional
 from cudf._lib.cpp.types cimport data_type, size_type
 
 
diff --git a/python/cudf/cudf/_lib/cpp/io/timezone.pxd b/python/cudf/cudf/_lib/cpp/io/timezone.pxd
index ba481d9a1d3..927c2118473 100644
--- a/python/cudf/cudf/_lib/cpp/io/timezone.pxd
+++ b/python/cudf/cudf/_lib/cpp/io/timezone.pxd
@@ -2,9 +2,9 @@
 
 from libcpp cimport bool
 from libcpp.memory cimport unique_ptr
+from libcpp.optional cimport optional
 from libcpp.string cimport string
 
-from cudf._lib.cpp.libcpp.optional cimport optional
 from cudf._lib.cpp.table.table cimport table
 
 
diff --git a/python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd b/python/cudf/cudf/_lib/cpp/libcpp/__init__.pxd
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/cpp/libcpp/__init__.py b/python/cudf/cudf/_lib/cpp/libcpp/__init__.py
deleted file mode 100644
index e69de29bb2d..00000000000
diff --git a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd
deleted file mode 100644
index f3e2d6d0878..00000000000
--- a/python/cudf/cudf/_lib/cpp/libcpp/functional.pxd
+++ /dev/null
@@ -1,7 +0,0 @@
-# Copyright (c) 2020, NVIDIA CORPORATION.
-
-
-cdef extern from "<functional>" namespace "std" nogil:
-    cdef cppclass reference_wrapper[T]:
-        reference_wrapper()
-        reference_wrapper(T)
diff --git a/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd b/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd
deleted file mode 100644
index 2178f1a940c..00000000000
--- a/python/cudf/cudf/_lib/cpp/libcpp/memory.pxd
+++ /dev/null
@@ -1,12 +0,0 @@
-# Copyright (c) 2023, NVIDIA CORPORATION.
-
-from libcpp.memory cimport unique_ptr
-
-
-cdef extern from "<memory>" namespace "std" nogil:
-    # The Cython standard header does not have except +, so C++
-    # exceptions from make_unique are not caught and translated to
-    # Python ones. This is not perfectly ergonomic, we always have to
-    # wrap make_unique in move, but at least we can catch exceptions.
-    # See https://github.com/cython/cython/issues/5560
-    unique_ptr[T] make_unique[T](...) except +
diff --git a/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd b/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd
deleted file mode 100644
index a78c18f3f7a..00000000000
--- a/python/cudf/cudf/_lib/cpp/libcpp/optional.pxd
+++ /dev/null
@@ -1,50 +0,0 @@
-# SPDX-FileCopyrightText: Copyright (c) 2023, NVIDIA CORPORATION &
-# AFFILIATES. All rights reserved.  SPDX-License-Identifier:
-# Apache-2.0
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-# http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-from libcpp cimport bool
-
-
-cdef extern from "<optional>" namespace "std" nogil:
-    cdef cppclass nullopt_t:
-        nullopt_t()
-
-    cdef nullopt_t nullopt
-
-    cdef cppclass optional[T]:
-        ctypedef T value_type
-        optional()
-        optional(nullopt_t)
-        optional(optional&) except +
-        optional(T&) except +
-        bool has_value()
-        T& value()
-        T& value_or[U](U& default_value)
-        void swap(optional&)
-        void reset()
-        T& emplace(...)
-        T& operator*()
-        optional& operator=(optional&)
-        optional& operator=[U](U&)
-        bool operator bool()
-        bool operator!()
-        bool operator==[U](optional&, U&)
-        bool operator!=[U](optional&, U&)
-        bool operator<[U](optional&, U&)
-        bool operator>[U](optional&, U&)
-        bool operator<=[U](optional&, U&)
-        bool operator>=[U](optional&, U&)
-
-    optional[T] make_optional[T](...) except +
diff --git a/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd
new file mode 100644
index 00000000000..e678e4e84db
--- /dev/null
+++ b/python/cudf/cudf/_lib/cpp/nvtext/byte_pair_encode.pxd
@@ -0,0 +1,24 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from libcpp.memory cimport unique_ptr
+from libcpp.string cimport string
+
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.scalar.scalar cimport string_scalar
+
+
+cdef extern from "nvtext/byte_pair_encoding.hpp" namespace "nvtext" nogil:
+    # Cython declarations for the C++ names used from nvtext/byte_pair_encoding.hpp.
+    cdef struct bpe_merge_pairs "nvtext::bpe_merge_pairs":
+        pass  # opaque: no fields are declared here
+
+    cdef unique_ptr[bpe_merge_pairs] load_merge_pairs(
+        const column_view &merge_pairs
+    ) except +  # C++ exceptions are translated to Python exceptions
+
+    cdef unique_ptr[column] byte_pair_encoding(
+        const column_view &strings,
+        const bpe_merge_pairs &merge_pairs,
+        const string_scalar &separator
+    ) except +
diff --git a/python/cudf/cudf/_lib/expressions.pyx b/python/cudf/cudf/_lib/expressions.pyx
index 8d7545ffe15..01a080f635f 100644
--- a/python/cudf/cudf/_lib/expressions.pyx
+++ b/python/cudf/cudf/_lib/expressions.pyx
@@ -4,12 +4,11 @@ from enum import Enum
 
 from cython.operator cimport dereference
 from libc.stdint cimport int64_t
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 
 from cudf._lib.cpp cimport expressions as libcudf_exp
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 from cudf._lib.cpp.types cimport size_type
 
 # Necessary for proper casting, see below.
diff --git a/python/cudf/cudf/_lib/groupby.pyx b/python/cudf/cudf/_lib/groupby.pyx
index a26d820de6f..b3778e45cde 100644
--- a/python/cudf/cudf/_lib/groupby.pyx
+++ b/python/cudf/cudf/_lib/groupby.pyx
@@ -24,6 +24,8 @@ from cudf._lib.utils cimport columns_from_unique_ptr, table_view_from_columns
 
 from cudf._lib.scalar import as_device_scalar
 
+from libcpp.functional cimport reference_wrapper
+
 cimport cudf._lib.cpp.groupby as libcudf_groupby
 cimport cudf._lib.cpp.types as libcudf_types
 from cudf._lib.aggregation cimport (
@@ -33,7 +35,6 @@ from cudf._lib.aggregation cimport (
     make_groupby_scan_aggregation,
 )
 from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.libcpp.functional cimport reference_wrapper
 from cudf._lib.cpp.replace cimport replace_policy
 from cudf._lib.cpp.scalar.scalar cimport scalar
 from cudf._lib.cpp.table.table cimport table, table_view
diff --git a/python/cudf/cudf/_lib/join.pyx b/python/cudf/cudf/_lib/join.pyx
index 416680aae24..378be978cc0 100644
--- a/python/cudf/cudf/_lib/join.pyx
+++ b/python/cudf/cudf/_lib/join.pyx
@@ -2,7 +2,7 @@
 
 from cudf.core.buffer import acquire_spill_lock
 
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.pair cimport pair
 from libcpp.utility cimport move
 
@@ -11,7 +11,6 @@ from rmm._lib.device_buffer cimport device_buffer
 cimport cudf._lib.cpp.join as cpp_join
 from cudf._lib.column cimport Column
 from cudf._lib.cpp.column.column cimport column
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.cpp.types cimport data_type, size_type, type_id
 from cudf._lib.utils cimport table_view_from_columns
diff --git a/python/cudf/cudf/_lib/null_mask.pyx b/python/cudf/cudf/_lib/null_mask.pyx
index 5b4538629f6..1f98140d9e4 100644
--- a/python/cudf/cudf/_lib/null_mask.pyx
+++ b/python/cudf/cudf/_lib/null_mask.pyx
@@ -6,13 +6,12 @@ from rmm._lib.device_buffer cimport DeviceBuffer, device_buffer
 
 from cudf.core.buffer import acquire_spill_lock, as_buffer
 
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.pair cimport pair
 from libcpp.utility cimport move
 
 from cudf._lib.column cimport Column
 from cudf._lib.cpp.column.column_view cimport column_view
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 from cudf._lib.cpp.null_mask cimport (
     bitmask_allocation_size_bytes as cpp_bitmask_allocation_size_bytes,
     bitmask_and as cpp_bitmask_and,
diff --git a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
index d4e2392ee04..d7cbdeb5bda 100644
--- a/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
+++ b/python/cudf/cudf/_lib/nvtext/CMakeLists.txt
@@ -13,8 +13,8 @@
 # =============================================================================
 
 set(cython_sources
-    edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx ngrams_tokenize.pyx normalize.pyx
-    replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
+    byte_pair_encode.pyx edit_distance.pyx generate_ngrams.pyx jaccard.pyx minhash.pyx
+    ngrams_tokenize.pyx normalize.pyx replace.pyx stemmer.pyx subword_tokenize.pyx tokenize.pyx
 )
 set(linked_libraries cudf::cudf)
 rapids_cython_create_modules(
diff --git a/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
new file mode 100644
index 00000000000..cfc76afa8a5
--- /dev/null
+++ b/python/cudf/cudf/_lib/nvtext/byte_pair_encode.pyx
@@ -0,0 +1,50 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+
+from cudf.core.buffer import acquire_spill_lock
+
+from libcpp.memory cimport unique_ptr
+from libcpp.utility cimport move
+
+from cudf._lib.column cimport Column
+from cudf._lib.cpp.column.column cimport column
+from cudf._lib.cpp.column.column_view cimport column_view
+from cudf._lib.cpp.nvtext.byte_pair_encode cimport (
+    bpe_merge_pairs as cpp_bpe_merge_pairs,
+    byte_pair_encoding as cpp_byte_pair_encoding,
+    load_merge_pairs as cpp_load_merge_pairs,
+)
+from cudf._lib.cpp.scalar.scalar cimport string_scalar
+from cudf._lib.scalar cimport DeviceScalar
+
+
+cdef class BPEMergePairs:
+    cdef unique_ptr[cpp_bpe_merge_pairs] c_obj
+
+    def __cinit__(self, Column merge_pairs):
+        cdef column_view c_pairs = merge_pairs.view()
+        with nogil:
+            self.c_obj = move(cpp_load_merge_pairs(c_pairs))
+
+
+@acquire_spill_lock()
+def byte_pair_encoding(
+    Column strings,
+    BPEMergePairs merge_pairs,
+    object separator
+):
+    cdef column_view c_strings = strings.view()
+    cdef DeviceScalar d_separator = separator.device_value
+    cdef const string_scalar* c_separator = <const string_scalar*>d_separator\
+        .get_raw_ptr()
+    cdef unique_ptr[column] c_result
+    with nogil:
+        c_result = move(
+            cpp_byte_pair_encoding(
+                c_strings,
+                merge_pairs.c_obj.get()[0],
+                c_separator[0]
+            )
+        )
+
+    return Column.from_unique_ptr(move(c_result))
diff --git a/python/cudf/cudf/_lib/parquet.pyx b/python/cudf/cudf/_lib/parquet.pyx
index d8d363686cc..4acb1ce10b1 100644
--- a/python/cudf/cudf/_lib/parquet.pyx
+++ b/python/cudf/cudf/_lib/parquet.pyx
@@ -32,7 +32,7 @@ from cudf._lib.utils import _index_level_name, generate_pandas_metadata
 from libc.stdint cimport uint8_t
 from libcpp cimport bool
 from libcpp.map cimport map
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.unordered_map cimport unordered_map
 from libcpp.utility cimport move
@@ -52,7 +52,6 @@ from cudf._lib.cpp.io.parquet cimport (
     write_parquet as parquet_writer,
 )
 from cudf._lib.cpp.io.types cimport column_in_metadata, table_input_metadata
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 from cudf._lib.cpp.table.table_view cimport table_view
 from cudf._lib.cpp.types cimport data_type, size_type
 from cudf._lib.io.datasource cimport NativeFileDatasource
diff --git a/python/cudf/cudf/_lib/timezone.pyx b/python/cudf/cudf/_lib/timezone.pyx
index 4d76cbfcdb5..808d1321b0b 100644
--- a/python/cudf/cudf/_lib/timezone.pyx
+++ b/python/cudf/cudf/_lib/timezone.pyx
@@ -1,13 +1,13 @@
 # Copyright (c) 2023, NVIDIA CORPORATION.
 
 from libcpp.memory cimport unique_ptr
+from libcpp.optional cimport make_optional
 from libcpp.string cimport string
 from libcpp.utility cimport move
 
 from cudf._lib.cpp.io.timezone cimport (
     make_timezone_transition_table as cpp_make_timezone_transition_table,
 )
-from cudf._lib.cpp.libcpp.optional cimport make_optional
 from cudf._lib.cpp.table.table cimport table
 from cudf._lib.utils cimport columns_from_unique_ptr
 
diff --git a/python/cudf/cudf/core/byte_pair_encoding.py b/python/cudf/cudf/core/byte_pair_encoding.py
new file mode 100644
index 00000000000..4c881022ecf
--- /dev/null
+++ b/python/cudf/cudf/core/byte_pair_encoding.py
@@ -0,0 +1,59 @@
+# Copyright (c) 2023, NVIDIA CORPORATION.
+
+from __future__ import annotations
+
+import cudf
+from cudf._lib.nvtext.byte_pair_encode import (
+    BPEMergePairs as cpp_merge_pairs,
+    byte_pair_encoding as cpp_byte_pair_encoding,
+)
+
+
+class BytePairEncoder:
+    """
+    Given a merge pairs strings series, performs byte pair encoding on
+    a strings series using the provided separator.
+
+    Parameters
+    ----------
+    merges_pair : cudf.Series
+        Strings column of merge pairs
+
+    Returns
+    -------
+    BytePairEncoder
+    """
+
+    def __init__(self, merges_pair: "cudf.Series"):
+        self.merge_pairs = cpp_merge_pairs(merges_pair._column)
+
+    def __call__(self, text, separator: str = " "):
+        """Encode ``text``, placing ``separator`` between the output tokens.
+
+        Parameters
+        ----------
+        text : cudf string series
+            The strings to be encoded.
+
+        Returns
+        -------
+        Encoded strings
+
+        Examples
+        --------
+        >>> import cudf
+        >>> from cudf.core.byte_pair_encoding import BytePairEncoder
+        >>> mps = cudf.Series(["e n", "i t", "i s", "e s", "en t",
+        ...                    "c e", "es t", "en ce", "T h", "Th is",
+        ...                    "t est", "s ent", "t h", "th is"])
+        >>> bpe = BytePairEncoder(mps)
+        >>> str_series = cudf.Series(['This is a sentence', 'thisisit'])
+        >>> bpe(str_series)
+        0    This is a sent ence
+        1             this is it
+        dtype: object
+        """
+        sep = cudf.Scalar(separator, dtype="str")
+        result = cpp_byte_pair_encoding(text._column, self.merge_pairs, sep)
+
+        return cudf.Series(result)
diff --git a/python/cudf/cudf/core/column/column.py b/python/cudf/cudf/core/column/column.py
index a5e99abd79e..b4f65693d85 100644
--- a/python/cudf/cudf/core/column/column.py
+++ b/python/cudf/cudf/core/column/column.py
@@ -2102,7 +2102,10 @@ def as_column(
     elif isinstance(arbitrary, (pd.Timestamp, pd.Timedelta)):
         # This will always treat NaTs as nulls since it's not technically a
         # discrete value like NaN
-        data = as_column(pa.array(pd.Series([arbitrary]), from_pandas=True))
+        length = length or 1
+        data = as_column(
+            pa.array(pd.Series([arbitrary] * length), from_pandas=True)
+        )
         if dtype is not None:
             data = data.astype(dtype)
 
diff --git a/python/cudf/cudf/core/indexed_frame.py b/python/cudf/cudf/core/indexed_frame.py
index 376bef6d0b2..4211a8c24bf 100644
--- a/python/cudf/cudf/core/indexed_frame.py
+++ b/python/cudf/cudf/core/indexed_frame.py
@@ -2607,6 +2607,10 @@ def _reindex(
 
         df = self
         if index is not None:
+            if not df._index.is_unique:
+                raise ValueError(
+                    "cannot reindex on an axis with duplicate labels"
+                )
             index = cudf.core.index.as_index(
                 index, name=getattr(index, "name", self._index.name)
             )
diff --git a/python/cudf/cudf/pandas/module_accelerator.py b/python/cudf/cudf/pandas/module_accelerator.py
index eb35c4adaaf..180d75d96e8 100644
--- a/python/cudf/cudf/pandas/module_accelerator.py
+++ b/python/cudf/cudf/pandas/module_accelerator.py
@@ -10,6 +10,7 @@
 import importlib.abc
 import importlib.machinery
 import os
+import pathlib
 import sys
 import threading
 import warnings
@@ -554,9 +555,10 @@ def getattr_real_or_wrapped(
             frame = sys._getframe()
             # We cannot possibly be at the top level.
             assert frame.f_back
-            calling_module = frame.f_back.f_code.co_filename
+            calling_module = pathlib.PurePath(frame.f_back.f_code.co_filename)
             use_real = any(
-                calling_module.startswith(path) for path in loader._denylist
+                calling_module.is_relative_to(path)
+                for path in loader._denylist
             )
         try:
             if use_real:
diff --git a/python/cudf/cudf/tests/test_column.py b/python/cudf/cudf/tests/test_column.py
index db0446d506c..0546638f388 100644
--- a/python/cudf/cudf/tests/test_column.py
+++ b/python/cudf/cudf/tests/test_column.py
@@ -193,12 +193,15 @@ def test_column_mixed_dtype(data, error):
 
 
 @pytest.mark.parametrize("nan_as_null", [True, False])
-def test_as_column_scalar_with_nan(nan_as_null):
-    size = 10
-    scalar = np.nan
-
+@pytest.mark.parametrize(
+    "scalar",
+    [np.nan, pd.Timedelta(days=1), pd.Timestamp(2020, 1, 1)],
+    ids=repr,
+)
+@pytest.mark.parametrize("size", [1, 10])
+def test_as_column_scalar_with_nan(nan_as_null, scalar, size):
     expected = (
-        cudf.Series([np.nan] * size, nan_as_null=nan_as_null)
+        cudf.Series([scalar] * size, nan_as_null=nan_as_null)
         .dropna()
         .to_numpy()
     )
diff --git a/python/cudf/cudf/tests/test_dataframe.py b/python/cudf/cudf/tests/test_dataframe.py
index d44cf594e8b..5677f97408a 100644
--- a/python/cudf/cudf/tests/test_dataframe.py
+++ b/python/cudf/cudf/tests/test_dataframe.py
@@ -10723,3 +10723,15 @@ def test_dataframe_series_dot():
     expected = gser @ [12, 13]
 
     assert_eq(expected, actual)
+
+
+def test_dataframe_duplicate_index_reindex():
+    gdf = cudf.DataFrame({"a": [0, 1, 2, 3]}, index=[0, 0, 1, 1])
+    pdf = gdf.to_pandas()
+
+    assert_exceptions_equal(
+        gdf.reindex,
+        pdf.reindex,
+        lfunc_args_and_kwargs=([10, 11, 12, 13], {}),
+        rfunc_args_and_kwargs=([10, 11, 12, 13], {}),
+    )
diff --git a/python/cudf/cudf/tests/test_series.py b/python/cudf/cudf/tests/test_series.py
index 8f8f87c20e0..c15a797713f 100644
--- a/python/cudf/cudf/tests/test_series.py
+++ b/python/cudf/cudf/tests/test_series.py
@@ -2638,3 +2638,15 @@ def test_series_setitem_mixed_bool_dtype():
     s = cudf.Series([True, False, True])
     with pytest.raises(TypeError):
         s[0] = 10
+
+
+def test_series_duplicate_index_reindex():
+    gs = cudf.Series([0, 1, 2, 3], index=[0, 0, 1, 1])
+    ps = gs.to_pandas()
+
+    assert_exceptions_equal(
+        gs.reindex,
+        ps.reindex,
+        lfunc_args_and_kwargs=([10, 11, 12, 13], {}),
+        rfunc_args_and_kwargs=([10, 11, 12, 13], {}),
+    )
diff --git a/python/cudf/cudf/tests/text/test_text_methods.py b/python/cudf/cudf/tests/text/test_text_methods.py
index e565df8f3da..2dccd583b23 100644
--- a/python/cudf/cudf/tests/text/test_text_methods.py
+++ b/python/cudf/cudf/tests/text/test_text_methods.py
@@ -7,6 +7,7 @@
 import pytest
 
 import cudf
+from cudf.core.byte_pair_encoding import BytePairEncoder
 from cudf.core.tokenize_vocabulary import TokenizeVocabulary
 from cudf.testing._utils import assert_eq
 
@@ -1024,3 +1025,43 @@ def test_jaccard_index_random_strings():
 
     actual = str1.str.jaccard_index(str2, jaccard_width)
     assert_eq(expected, actual)
+
+
+@pytest.mark.parametrize(
+    "separator, input, results",
+    [
+        (" ", "thetestsentence", "the test sent ence"),
+        ("_", "sentenceistest", "sent_ence_is_test"),
+        ("$", "istestsentencehere", "is$test$sent$ence$he$r$e"),
+    ],
+)
+def test_byte_pair_encoding(separator, input, results):
+    pairs_table = cudf.Series(
+        [
+            "t he",
+            "h e",
+            "e n",
+            "i t",
+            "i s",
+            "e s",
+            "en t",
+            "c e",
+            "es t",
+            "en ce",
+            "t h",
+            "h i",
+            "th is",
+            "t est",
+            "s i",
+            "s ent",
+        ]
+    )
+    encoder = BytePairEncoder(pairs_table)
+
+    strings = cudf.Series([input, None, "", input])
+
+    expected = cudf.Series([results, None, "", results])
+
+    actual = encoder(strings, separator)
+    assert type(expected) == type(actual)
+    assert_eq(expected, actual)
diff --git a/python/cudf/pyproject.toml b/python/cudf/pyproject.toml
index 9dc006a86ba..d32284c0c5d 100644
--- a/python/cudf/pyproject.toml
+++ b/python/cudf/pyproject.toml
@@ -4,7 +4,7 @@
 build-backend = "setuptools.build_meta"
 requires = [
     "cmake>=3.26.4",
-    "cython>=3.0.0",
+    "cython>=3.0.3",
     "ninja",
     "numpy>=1.21,<1.25",
     "protoc-wheel",
diff --git a/python/cudf_kafka/CMakeLists.txt b/python/cudf_kafka/CMakeLists.txt
new file mode 100644
index 00000000000..1e21c873585
--- /dev/null
+++ b/python/cudf_kafka/CMakeLists.txt
@@ -0,0 +1,47 @@
+# =============================================================================
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+cmake_minimum_required(VERSION 3.26.4 FATAL_ERROR)
+
+set(cudf_kafka_version 24.02.00)
+
+include(../../fetch_rapids.cmake)
+
+project(
+  cudf-kafka-python
+  VERSION ${cudf_kafka_version}
+  LANGUAGES # TODO: Building Python extension modules via the python_extension_module requires the C
+            # language to be enabled here. The test project that is built in scikit-build to verify
+            # various linking options for the python library is hardcoded to build with C, so until
+            # that is fixed we need to keep C.
+            C CXX
+)
+
+find_package(cudf_kafka ${cudf_kafka_version} REQUIRED)
+
+if(NOT cudf_kafka_FOUND)
+  message(
+    FATAL_ERROR
+      "cudf_kafka package not found. cudf_kafka C++ is required to build this Python package."
+  )
+endif()
+
+include(rapids-cython)
+rapids_cython_init()
+
+add_subdirectory(cudf_kafka/_lib)
+
+if(DEFINED cython_lib_dir)
+  rapids_cython_add_rpath_entries(TARGET cudf_kafka PATHS "${cython_lib_dir}")
+endif()
diff --git a/python/cudf_kafka/LICENSE b/python/cudf_kafka/LICENSE
new file mode 120000
index 00000000000..30cff7403da
--- /dev/null
+++ b/python/cudf_kafka/LICENSE
@@ -0,0 +1 @@
+../../LICENSE
\ No newline at end of file
diff --git a/python/cudf_kafka/README.md b/python/cudf_kafka/README.md
new file mode 120000
index 00000000000..fe840054137
--- /dev/null
+++ b/python/cudf_kafka/README.md
@@ -0,0 +1 @@
+../../README.md
\ No newline at end of file
diff --git a/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt
new file mode 100644
index 00000000000..3262b7d5ebe
--- /dev/null
+++ b/python/cudf_kafka/cudf_kafka/_lib/CMakeLists.txt
@@ -0,0 +1,62 @@
+# =============================================================================
+# Copyright (c) 2022-2023, NVIDIA CORPORATION.
+#
+# Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except
+# in compliance with the License. You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software distributed under the License
+# is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express
+# or implied. See the License for the specific language governing permissions and limitations under
+# the License.
+# =============================================================================
+
+set(cython_sources kafka.pyx)
+set(linked_libraries cudf_kafka::cudf_kafka)
+
+rapids_cython_create_modules(
+  CXX ASSOCIATED_TARGETS cudf_kafka
+  SOURCE_FILES "${cython_sources}"
+  LINKED_LIBRARIES "${linked_libraries}"
+)
+
+# TODO: Finding NumPy currently requires finding Development due to a bug in CMake. This bug was
+# fixed in https://gitlab.kitware.com/cmake/cmake/-/merge_requests/7410 and will be available in
+# CMake 3.24, so we can remove the Development component once we upgrade to CMake 3.24.
+# find_package(Python REQUIRED COMPONENTS Development NumPy)
+
+# Note: The bug noted above prevents us from finding NumPy successfully using FindPython.cmake
+# inside the manylinux images used to build wheels because manylinux images do not contain
+# libpython.so and therefore Development cannot be found. Until we upgrade to CMake 3.24, we should
+# use FindNumpy.cmake instead (provided by scikit-build). When we switch to 3.24 we can try
+# switching back, but it may not work if that implicitly still requires Python libraries. In that
+# case we'll need to follow up with the CMake team to remove that dependency.  The stopgap solution
+# is to unpack the static lib tarballs in the wheel building jobs so that there are at least static
+# libs to be found, but that should be a last resort since it implies a dependency that isn't really
+# necessary. The relevant command is tar -xf /opt/_internal/static-libs-for-embedding-only.tar.xz -C
+# /opt/_internal
+find_package(NumPy REQUIRED)
+
+find_package(Python 3.9 REQUIRED COMPONENTS Interpreter)
+
+execute_process(
+  COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_include())"
+  OUTPUT_VARIABLE PYARROW_INCLUDE_DIR
+  ERROR_VARIABLE PYARROW_ERROR
+  RESULT_VARIABLE PYARROW_RESULT
+  OUTPUT_STRIP_TRAILING_WHITESPACE
+)
+
+if(${PYARROW_RESULT})
+  message(FATAL_ERROR "Error while trying to obtain pyarrow include directory:\n${PYARROW_ERROR}")
+endif()
+
+# TODO: Due to cudf's scalar.pyx needing to cimport pylibcudf's scalar.pyx (because there are parts
+# of cudf Cython that need to directly access the c_obj underlying the pylibcudf Scalar) the
+# requirement for arrow headers infects all of cudf. That in turn requires including numpy headers.
+# These requirements will go away once all scalar-related Cython code is removed from cudf.
+foreach(target IN LISTS RAPIDS_CYTHON_CREATED_TARGETS)
+  target_include_directories(${target} PRIVATE "${NumPy_INCLUDE_DIRS}")
+  target_include_directories(${target} PRIVATE "${PYARROW_INCLUDE_DIR}")
+endforeach()
diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
index ca729c62512..068837d04ee 100644
--- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
+++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pxd
@@ -11,12 +11,12 @@ from cudf._lib.cpp.io.datasource cimport datasource
 from cudf._lib.io.datasource cimport Datasource
 
 
-cdef extern from "kafka_callback.hpp" \
+cdef extern from "cudf_kafka/kafka_callback.hpp" \
         namespace "cudf::io::external::kafka" nogil:
     ctypedef object (*python_callable_type)()
 
 
-cdef extern from "kafka_consumer.hpp" \
+cdef extern from "cudf_kafka/kafka_consumer.hpp" \
         namespace "cudf::io::external::kafka" nogil:
 
     cpdef cppclass kafka_consumer:
diff --git a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx
index 4d732478723..2fbaacff7c6 100644
--- a/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx
+++ b/python/cudf_kafka/cudf_kafka/_lib/kafka.pyx
@@ -3,12 +3,11 @@
 from libc.stdint cimport int32_t, int64_t
 from libcpp cimport bool, nullptr
 from libcpp.map cimport map
-from libcpp.memory cimport unique_ptr
+from libcpp.memory cimport make_unique, unique_ptr
 from libcpp.string cimport string
 from libcpp.utility cimport move
 
 from cudf._lib.cpp.io.datasource cimport datasource
-from cudf._lib.cpp.libcpp.memory cimport make_unique
 
 from cudf_kafka._lib.kafka cimport kafka_consumer
 
diff --git a/python/cudf_kafka/pyproject.toml b/python/cudf_kafka/pyproject.toml
index 642c9ffbf43..05b1db3b9ac 100644
--- a/python/cudf_kafka/pyproject.toml
+++ b/python/cudf_kafka/pyproject.toml
@@ -3,9 +3,10 @@
 [build-system]
 
 requires = [
-    "cython>=3.0.0",
+    "cython>=3.0.3",
     "numpy>=1.21,<1.25",
     "pyarrow==14.0.1.*",
+    "scikit-build>=0.13.1",
     "setuptools",
     "wheel",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
diff --git a/python/cudf_kafka/setup.py b/python/cudf_kafka/setup.py
index 6f3909d4528..6a99e9ed968 100644
--- a/python/cudf_kafka/setup.py
+++ b/python/cudf_kafka/setup.py
@@ -1,96 +1,13 @@
-# Copyright (c) 2020-2023, NVIDIA CORPORATION.
-import os
-import shutil
-import sysconfig
-from distutils.sysconfig import get_python_lib
-
-import numpy as np
-import pyarrow as pa
-from Cython.Build import cythonize
-from setuptools import find_packages, setup
-from setuptools.extension import Extension
-
-cython_files = ["cudf_kafka/_lib/*.pyx"]
-
-CUDA_HOME = os.environ.get("CUDA_HOME", False)
-if not CUDA_HOME:
-    path_to_cuda_gdb = shutil.which("cuda-gdb")
-    if path_to_cuda_gdb is None:
-        raise OSError(
-            "Could not locate CUDA. "
-            "Please set the environment variable "
-            "CUDA_HOME to the path to the CUDA installation "
-            "and try again."
-        )
-    CUDA_HOME = os.path.dirname(os.path.dirname(path_to_cuda_gdb))
-
-if not os.path.isdir(CUDA_HOME):
-    raise OSError(f"Invalid CUDA_HOME: directory does not exist: {CUDA_HOME}")
-
-cuda_include_dir = os.path.join(CUDA_HOME, "include")
-
-CUDF_ROOT = os.environ.get(
-    "CUDF_ROOT",
-    os.path.abspath(
-        os.path.join(
-            os.path.dirname(os.path.abspath(__file__)), "../../cpp/build/"
-        )
-    ),
-)
-CUDF_KAFKA_ROOT = os.environ.get(
-    "CUDF_KAFKA_ROOT", "../../cpp/libcudf_kafka/build"
-)
-
-try:
-    nthreads = int(os.environ.get("PARALLEL_LEVEL", "0") or "0")
-except Exception:
-    nthreads = 0
-
-extensions = [
-    Extension(
-        "*",
-        sources=cython_files,
-        include_dirs=[
-            os.path.abspath(os.path.join(CUDF_ROOT, "../include/cudf")),
-            os.path.abspath(os.path.join(CUDF_ROOT, "../include")),
-            os.path.abspath(
-                os.path.join(CUDF_ROOT, "../libcudf_kafka/include/cudf_kafka")
-            ),
-            os.path.join(CUDF_ROOT, "include"),
-            os.path.join(CUDF_ROOT, "_deps/libcudacxx-src/include"),
-            os.path.join(
-                os.path.dirname(sysconfig.get_path("include")),
-                "rapids/libcudacxx",
-            ),
-            os.path.dirname(sysconfig.get_path("include")),
-            np.get_include(),
-            pa.get_include(),
-            cuda_include_dir,
-        ],
-        library_dirs=(
-            [
-                get_python_lib(),
-                os.path.join(os.sys.prefix, "lib"),
-                CUDF_KAFKA_ROOT,
-            ]
-        ),
-        libraries=["cudf", "cudf_kafka"],
-        language="c++",
-        extra_compile_args=["-std=c++17", "-DFMT_HEADER_ONLY=1"],
-    )
-]
+# Copyright (c) 2018-2023, NVIDIA CORPORATION.
+from setuptools import find_packages
+from skbuild import setup
 
 packages = find_packages(include=["cudf_kafka*"])
+
 setup(
-    # Include the separately-compiled shared library
-    ext_modules=cythonize(
-        extensions,
-        nthreads=nthreads,
-        compiler_directives=dict(
-            profile=False, language_level=3, embedsignature=True
-        ),
-    ),
     packages=packages,
-    package_data={key: ["VERSION", "*.pxd"] for key in packages},
+    package_data={
+        key: ["VERSION", "*.pxd", "*.hpp", "*.cuh"] for key in packages
+    },
     zip_safe=False,
 )
diff --git a/python/dask_cudf/dask_cudf/backends.py b/python/dask_cudf/dask_cudf/backends.py
index b0da82eaeee..387643587d1 100644
--- a/python/dask_cudf/dask_cudf/backends.py
+++ b/python/dask_cudf/dask_cudf/backends.py
@@ -427,17 +427,12 @@ def union_categoricals_cudf(
     )
 
 
-@_dask_cudf_nvtx_annotate
-def safe_hash(frame):
-    return cudf.Series(frame.hash_values(), index=frame.index)
-
-
 @hash_object_dispatch.register((cudf.DataFrame, cudf.Series))
 @_dask_cudf_nvtx_annotate
 def hash_object_cudf(frame, index=True):
     if index:
-        return safe_hash(frame.reset_index())
-    return safe_hash(frame)
+        frame = frame.reset_index()
+    return frame.hash_values()
 
 
 @hash_object_dispatch.register(cudf.BaseIndex)
@@ -445,10 +440,10 @@ def hash_object_cudf(frame, index=True):
 def hash_object_cudf_index(ind, index=None):
 
     if isinstance(ind, cudf.MultiIndex):
-        return safe_hash(ind.to_frame(index=False))
+        return ind.to_frame(index=False).hash_values()
 
     col = cudf.core.column.as_column(ind)
-    return safe_hash(cudf.Series(col))
+    return cudf.Series(col).hash_values()
 
 
 @group_split_dispatch.register((cudf.Series, cudf.DataFrame))
diff --git a/python/dask_cudf/pyproject.toml b/python/dask_cudf/pyproject.toml
index 12d98e57545..890be46b974 100644
--- a/python/dask_cudf/pyproject.toml
+++ b/python/dask_cudf/pyproject.toml
@@ -20,11 +20,10 @@ requires-python = ">=3.9"
 dependencies = [
     "cudf==24.2.*",
     "cupy-cuda11x>=12.0.0",
-    "dask>=2023.9.2",
-    "distributed>=2023.9.2",
     "fsspec>=0.6.0",
     "numpy>=1.21,<1.25",
     "pandas>=1.3,<1.6.0dev0",
+    "rapids-dask-dependency==24.2.*",
 ] # This list was generated by `rapids-dependency-file-generator`. To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`.
 classifiers = [
     "Intended Audience :: Developers",