diff --git a/.github/labeler.yml b/.github/labeler.yml index 41fd3802d55..3a868ac7d45 100644 --- a/.github/labeler.yml +++ b/.github/labeler.yml @@ -3,14 +3,14 @@ cuDF (Python): - 'python/**' - 'notebooks/**' - + libcudf: - 'cpp/**' CMake: - '**/CMakeLists.txt' - '**/cmake/**' - + cuDF (Java): - 'java/**' diff --git a/.github/workflows/add_to_project.yml b/.github/workflows/add_to_project.yml new file mode 100644 index 00000000000..b301c56a999 --- /dev/null +++ b/.github/workflows/add_to_project.yml @@ -0,0 +1,20 @@ +name: Add new issue/PR to project + +on: + issues: + types: + - opened + + pull_request_target: + types: + - opened + +jobs: + add-to-project: + name: Add issue or PR to project + runs-on: ubuntu-latest + steps: + - uses: actions/add-to-project@v0.3.0 + with: + project-url: https://github.com/orgs/rapidsai/projects/51 + github-token: ${{ secrets.ADD_TO_PROJECT_GITHUB_TOKEN }} diff --git a/.github/workflows/dependency-files.yml b/.github/workflows/dependency-files.yml new file mode 100644 index 00000000000..2ae939292d7 --- /dev/null +++ b/.github/workflows/dependency-files.yml @@ -0,0 +1,12 @@ +name: pr + +on: + pull_request: + +jobs: + checks: + secrets: inherit + uses: rapidsai/shared-action-workflows/.github/workflows/checks.yaml@main + with: + enable_check_size: false + enable_check_style: false diff --git a/.github/workflows/stale.yaml b/.github/workflows/stale.yaml deleted file mode 100644 index 741e159fbd8..00000000000 --- a/.github/workflows/stale.yaml +++ /dev/null @@ -1,57 +0,0 @@ -name: Mark inactive issues and pull requests - -on: - schedule: - - cron: "0 * * * *" - -jobs: - mark-inactive-30d: - runs-on: ubuntu-latest - steps: - - name: Mark 30 day inactive issues - uses: actions/stale@v3 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - stale-issue-message: > - This issue has been labeled `inactive-30d` due to no recent activity in the past 30 days. - Please close this issue if no further response or action is needed. - Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. - This issue will be labeled `inactive-90d` if there is no activity in the next 60 days. - stale-issue-label: "inactive-30d" - exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue" - days-before-issue-stale: 30 - days-before-issue-close: -1 - stale-pr-message: > - This PR has been labeled `inactive-30d` due to no recent activity in the past 30 days. - Please close this PR if it is no longer required. - Otherwise, please respond with a comment indicating any updates. - This PR will be labeled `inactive-90d` if there is no activity in the next 60 days. - stale-pr-label: "inactive-30d" - exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue" - days-before-pr-stale: 30 - days-before-pr-close: -1 - operations-per-run: 50 - mark-inactive-90d: - runs-on: ubuntu-latest - steps: - - name: Mark 90 day inactive issues - uses: actions/stale@v3 - with: - repo-token: ${{ secrets.GITHUB_TOKEN }} - stale-issue-message: > - This issue has been labeled `inactive-90d` due to no recent activity in the past 90 days. - Please close this issue if no further response or action is needed. - Otherwise, please respond with a comment indicating any updates or changes to the original issue and/or confirm this issue still needs to be addressed. 
- stale-issue-label: "inactive-90d" - exempt-issue-labels: "0 - Blocked,0 - Backlog,good first issue" - days-before-issue-stale: 90 - days-before-issue-close: -1 - stale-pr-message: > - This PR has been labeled `inactive-90d` due to no recent activity in the past 90 days. - Please close this PR if it is no longer required. - Otherwise, please respond with a comment indicating any updates. - stale-pr-label: "inactive-90d" - exempt-pr-labels: "0 - Blocked,0 - Backlog,good first issue" - days-before-pr-stale: 90 - days-before-pr-close: -1 - operations-per-run: 50 diff --git a/.github/workflows/wheels.yml b/.github/workflows/wheels.yml new file mode 100644 index 00000000000..7f1c708c9a7 --- /dev/null +++ b/.github/workflows/wheels.yml @@ -0,0 +1,77 @@ +name: cuDF wheels + +on: + workflow_call: + inputs: + versioneer-override: + type: string + default: '' + build-tag: + type: string + default: '' + branch: + required: true + type: string + date: + required: true + type: string + sha: + required: true + type: string + build-type: + type: string + default: nightly + +concurrency: + group: "cudf-${{ github.workflow }}-${{ github.ref }}" + cancel-in-progress: true + +jobs: + cudf-wheels: + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-manylinux.yml@main + with: + repo: rapidsai/cudf + + build-type: ${{ inputs.build-type }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + + package-dir: python/cudf + package-name: cudf + + python-package-versioneer-override: ${{ inputs.versioneer-override }} + python-package-build-tag: ${{ inputs.build-tag }} + + skbuild-configure-options: "-DCUDF_BUILD_WHEELS=ON -DDETECT_CONDA_ENV=OFF" + + test-extras: test + + # Have to manually specify the cupy install location on arm. + # Have to also manually install tokenizers==0.10.2, which is the last tokenizers + # to have a binary aarch64 wheel available on PyPI + # Otherwise, the tokenizers sdist is used, which needs a Rust compiler + test-before-arm64: "pip install tokenizers==0.10.2 cupy-cuda11x -f https://pip.cupy.dev/aarch64" + + test-unittest: "pytest -v -n 8 ./python/cudf/cudf/tests" + secrets: inherit + dask_cudf-wheel: + needs: cudf-wheels + uses: rapidsai/shared-action-workflows/.github/workflows/wheels-pure.yml@main + with: + repo: rapidsai/cudf + + build-type: ${{ inputs.build-type }} + branch: ${{ inputs.branch }} + sha: ${{ inputs.sha }} + date: ${{ inputs.date }} + + package-dir: python/dask_cudf + package-name: dask_cudf + + python-package-versioneer-override: ${{ inputs.versioneer-override }} + python-package-build-tag: ${{ inputs.build-tag }} + + test-extras: test + test-unittest: "pytest -v -n 8 ./python/dask_cudf/dask_cudf/tests" + secrets: inherit diff --git a/.gitignore b/.gitignore index 0d63c76bf9f..1867e65b7be 100644 --- a/.gitignore +++ b/.gitignore @@ -70,7 +70,6 @@ junit-cudf.xml test-results ## Patching -*.diff *.orig *.rej @@ -166,3 +165,8 @@ dask-worker-space/ # Sphinx docs & build artifacts docs/cudf/source/api_docs/generated/* docs/cudf/source/api_docs/api/* +docs/cudf/source/user_guide/example_output/* +docs/cudf/source/user_guide/cudf.*Dtype.*.rst + +# cibuildwheel +/wheelhouse diff --git a/.pre-commit-config.yaml b/.pre-commit-config.yaml index 1046f4ebe6f..75d285f4f54 100644 --- a/.pre-commit-config.yaml +++ b/.pre-commit-config.yaml @@ -1,6 +1,19 @@ # Copyright (c) 2019-2022, NVIDIA CORPORATION. 
repos: + - repo: https://github.com/pre-commit/pre-commit-hooks + rev: v4.3.0 + hooks: + - id: trailing-whitespace + exclude: | + (?x)^( + ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* + ) + - id: end-of-file-fixer + exclude: | + (?x)^( + ^python/cudf/cudf/tests/data/subword_tokenizer_data/.* + ) - repo: https://github.com/PyCQA/isort rev: 5.10.1 hooks: @@ -18,12 +31,18 @@ repos: # Explicitly specify the pyproject.toml at the repo root, not per-project. args: ["--config", "pyproject.toml"] - repo: https://github.com/PyCQA/flake8 - rev: 3.8.3 + rev: 5.0.4 hooks: - id: flake8 args: ["--config=setup.cfg"] - files: python/.*\.(py|pyx|pxd)$ + files: python/.*$ types: [file] + types_or: [python, cython] + additional_dependencies: ["flake8-force"] + - repo: https://github.com/MarcoGorelli/cython-lint + rev: v0.1.10 + hooks: + - id: cython-lint - repo: https://github.com/pre-commit/mirrors-mypy rev: 'v0.971' hooks: @@ -46,6 +65,16 @@ repos: - id: clang-format types_or: [c, c++, cuda] args: ["-fallback-style=none", "-style=file", "-i"] + - repo: https://github.com/sirosen/texthooks + rev: 0.4.0 + hooks: + - id: fix-smartquotes + exclude: | + (?x)^( + ^cpp/include/cudf_test/cxxopts.hpp| + ^python/cudf/cudf/tests/data/subword_tokenizer_data/.*| + ^python/cudf/cudf/tests/test_text.py + ) - repo: local hooks: - id: no-deprecationwarning diff --git a/CHANGELOG.md b/CHANGELOG.md index d06545a94b5..5c7f1d0db28 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,233 @@ +# cuDF 22.12.00 (8 Dec 2022) + +## 🚨 Breaking Changes + +- Add JNI for `substring` without 'end' parameter. ([#12113](https://github.com/rapidsai/cudf/pull/12113)) [@firestarman](https://github.com/firestarman) +- Refactor `purge_nonempty_nulls` ([#12111](https://github.com/rapidsai/cudf/pull/12111)) [@ttnghia](https://github.com/ttnghia) +- Create an `int8` column in `read_csv` when all elements are missing ([#12110](https://github.com/rapidsai/cudf/pull/12110)) [@vuule](https://github.com/vuule) +- Throw an error when libcudf is built without cuFile and `LIBCUDF_CUFILE_POLICY` is set to `"ALWAYS"` ([#12080](https://github.com/rapidsai/cudf/pull/12080)) [@vuule](https://github.com/vuule) +- Fix type promotion edge cases in numerical binops ([#12074](https://github.com/rapidsai/cudf/pull/12074)) [@wence-](https://github.com/wence-) +- Reduce/Remove reliance on `**kwargs` and `*args` in `IO` readers & writers ([#12025](https://github.com/rapidsai/cudf/pull/12025)) [@galipremsagar](https://github.com/galipremsagar) +- Rollback of `DeviceBufferLike` ([#12009](https://github.com/rapidsai/cudf/pull/12009)) [@madsbk](https://github.com/madsbk) +- Remove unused `managed_allocator` ([#12005](https://github.com/rapidsai/cudf/pull/12005)) [@vyasr](https://github.com/vyasr) +- Pass column names to `write_csv` instead of `table_metadata` pointer ([#11972](https://github.com/rapidsai/cudf/pull/11972)) [@vuule](https://github.com/vuule) +- Accept const refs instead of const unique_ptr refs in reduce and scan APIs. ([#11960](https://github.com/rapidsai/cudf/pull/11960)) [@vyasr](https://github.com/vyasr) +- Default to equal NaNs in make_merge_sets_aggregation. 
([#11952](https://github.com/rapidsai/cudf/pull/11952)) [@bdice](https://github.com/bdice)
+- Remove validation that requires introspection ([#11938](https://github.com/rapidsai/cudf/pull/11938)) [@vyasr](https://github.com/vyasr)
+- Trim quotes for non-string values in nested json parsing ([#11898](https://github.com/rapidsai/cudf/pull/11898)) [@karthikeyann](https://github.com/karthikeyann)
+- Add tests ensuring that cudf's default stream is always used ([#11875](https://github.com/rapidsai/cudf/pull/11875)) [@vyasr](https://github.com/vyasr)
+- Support nested types as groupby keys in libcudf ([#11792](https://github.com/rapidsai/cudf/pull/11792)) [@PointKernel](https://github.com/PointKernel)
+- Default to equal NaNs in make_collect_set_aggregation. ([#11621](https://github.com/rapidsai/cudf/pull/11621)) [@bdice](https://github.com/bdice)
+- Removing int8 column option from parquet byte_array writing ([#11539](https://github.com/rapidsai/cudf/pull/11539)) [@hyperbolic2346](https://github.com/hyperbolic2346)
+- part1: Simplify BaseIndex to an abstract class ([#10389](https://github.com/rapidsai/cudf/pull/10389)) [@skirui-source](https://github.com/skirui-source)
+
+## 🐛 Bug Fixes
+
+- Fix include line for IO Cython modules ([#12250](https://github.com/rapidsai/cudf/pull/12250)) [@vyasr](https://github.com/vyasr)
+- Make dask pinning looser ([#12231](https://github.com/rapidsai/cudf/pull/12231)) [@vyasr](https://github.com/vyasr)
+- Workaround for CUB segmented-sort bug with boolean keys ([#12217](https://github.com/rapidsai/cudf/pull/12217)) [@davidwendt](https://github.com/davidwendt)
+- Fix `from_dict` backend dispatch to match upstream `dask` ([#12203](https://github.com/rapidsai/cudf/pull/12203)) [@galipremsagar](https://github.com/galipremsagar)
+- Merge branch-22.10 into branch-22.12 ([#12198](https://github.com/rapidsai/cudf/pull/12198)) [@davidwendt](https://github.com/davidwendt)
+- Fix compression in ORC writer ([#12194](https://github.com/rapidsai/cudf/pull/12194)) [@vuule](https://github.com/vuule)
+- Don't use CMake 3.25.0 as it has a show stopping FindCUDAToolkit bug ([#12188](https://github.com/rapidsai/cudf/pull/12188)) [@robertmaynard](https://github.com/robertmaynard)
+- Fix data corruption when reading ORC files with empty stripes ([#12160](https://github.com/rapidsai/cudf/pull/12160)) [@vuule](https://github.com/vuule)
+- Fix decimal binary operations ([#12142](https://github.com/rapidsai/cudf/pull/12142)) [@galipremsagar](https://github.com/galipremsagar)
+- Ensure dlpack include is provided to cudf interop lib ([#12139](https://github.com/rapidsai/cudf/pull/12139)) [@robertmaynard](https://github.com/robertmaynard)
+- Safely allocate `udf_string` pointers in `strings_udf` ([#12138](https://github.com/rapidsai/cudf/pull/12138)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Fix/disable jitify lto ([#12122](https://github.com/rapidsai/cudf/pull/12122)) [@robertmaynard](https://github.com/robertmaynard)
+- Fix conditional_full_join benchmark ([#12121](https://github.com/rapidsai/cudf/pull/12121)) [@GregoryKimball](https://github.com/GregoryKimball)
+- Fix regex working-memory-size refactor error ([#12119](https://github.com/rapidsai/cudf/pull/12119)) [@davidwendt](https://github.com/davidwendt)
+- Add in negative size checks for columns ([#12118](https://github.com/rapidsai/cudf/pull/12118)) [@revans2](https://github.com/revans2)
+- Add JNI for `substring` 
without 'end' parameter. ([#12113](https://github.com/rapidsai/cudf/pull/12113)) [@firestarman](https://github.com/firestarman) +- Fix reading of CSV files with blank second row ([#12098](https://github.com/rapidsai/cudf/pull/12098)) [@vuule](https://github.com/vuule) +- Fix an error in IO with `GzipFile` type ([#12085](https://github.com/rapidsai/cudf/pull/12085)) [@galipremsagar](https://github.com/galipremsagar) +- Workaround groupby aggregate thrust::copy_if overflow ([#12079](https://github.com/rapidsai/cudf/pull/12079)) [@davidwendt](https://github.com/davidwendt) +- Fix alignment of compressed blocks in ORC writer ([#12077](https://github.com/rapidsai/cudf/pull/12077)) [@vuule](https://github.com/vuule) +- Fix singleton-range `__setitem__` edge case ([#12075](https://github.com/rapidsai/cudf/pull/12075)) [@wence-](https://github.com/wence-) +- Fix type promotion edge cases in numerical binops ([#12074](https://github.com/rapidsai/cudf/pull/12074)) [@wence-](https://github.com/wence-) +- Force using old fmt in nvbench. ([#12067](https://github.com/rapidsai/cudf/pull/12067)) [@vyasr](https://github.com/vyasr) +- Fixes List offset bug in Nested JSON reader ([#12060](https://github.com/rapidsai/cudf/pull/12060)) [@karthikeyann](https://github.com/karthikeyann) +- Allow falling back to `shim_60.ptx` by default in `strings_udf` ([#12056](https://github.com/rapidsai/cudf/pull/12056)) [@brandon-b-miller](https://github.com/brandon-b-miller) +- Force black exclusions for pre-commit. ([#12036](https://github.com/rapidsai/cudf/pull/12036)) [@bdice](https://github.com/bdice) +- Add `memory_usage` & `items` implementation for `Struct` column & dtype ([#12033](https://github.com/rapidsai/cudf/pull/12033)) [@galipremsagar](https://github.com/galipremsagar) +- Reduce/Remove reliance on `**kwargs` and `*args` in `IO` readers & writers ([#12025](https://github.com/rapidsai/cudf/pull/12025)) [@galipremsagar](https://github.com/galipremsagar) +- Fixes bug in csv_reader_options construction in cython ([#12021](https://github.com/rapidsai/cudf/pull/12021)) [@karthikeyann](https://github.com/karthikeyann) +- Fix issues when both `usecols` and `names` options are used in `read_csv` ([#12018](https://github.com/rapidsai/cudf/pull/12018)) [@vuule](https://github.com/vuule) +- Port thrust's pinned_allocator to cudf, since Thrust 1.17 removes the type ([#12004](https://github.com/rapidsai/cudf/pull/12004)) [@robertmaynard](https://github.com/robertmaynard) +- Revert "Replace most of preprocessor usage in nvcomp adapter with `constexpr`" ([#11999](https://github.com/rapidsai/cudf/pull/11999)) [@vuule](https://github.com/vuule) +- Fix bug where `df.loc` resulting in single row could give wrong index ([#11998](https://github.com/rapidsai/cudf/pull/11998)) [@eriknw](https://github.com/eriknw) +- Switch to DISABLE_DEPRECATION_WARNINGS to match other RAPIDS projects ([#11989](https://github.com/rapidsai/cudf/pull/11989)) [@robertmaynard](https://github.com/robertmaynard) +- Fix maximum page size estimate in Parquet writer ([#11962](https://github.com/rapidsai/cudf/pull/11962)) [@vuule](https://github.com/vuule) +- Fix local offset handling in bgzip reader ([#11918](https://github.com/rapidsai/cudf/pull/11918)) [@upsj](https://github.com/upsj) +- Fix an issue reading struct-of-list types in Parquet. 
([#11910](https://github.com/rapidsai/cudf/pull/11910)) [@nvdbaranec](https://github.com/nvdbaranec)
+- Fix memcheck error in TypeInference.Timestamp gtest ([#11905](https://github.com/rapidsai/cudf/pull/11905)) [@davidwendt](https://github.com/davidwendt)
+- Fix type casting in Series.__setitem__ ([#11904](https://github.com/rapidsai/cudf/pull/11904)) [@wence-](https://github.com/wence-)
+- Fix memcheck error in get_dremel_data ([#11903](https://github.com/rapidsai/cudf/pull/11903)) [@davidwendt](https://github.com/davidwendt)
+- Fixes Unsupported column type error due to empty list columns in Nested JSON reader ([#11897](https://github.com/rapidsai/cudf/pull/11897)) [@karthikeyann](https://github.com/karthikeyann)
+- Fix segmented-sort to ignore indices outside the offsets ([#11888](https://github.com/rapidsai/cudf/pull/11888)) [@davidwendt](https://github.com/davidwendt)
+- Fix cudf::stable_sorted_order for NaN and -NaN in FLOAT64 columns ([#11874](https://github.com/rapidsai/cudf/pull/11874)) [@davidwendt](https://github.com/davidwendt)
+- Fix writing of Parquet files with many fragments ([#11869](https://github.com/rapidsai/cudf/pull/11869)) [@etseidl](https://github.com/etseidl)
+- Fix RangeIndex unary operators. ([#11868](https://github.com/rapidsai/cudf/pull/11868)) [@vyasr](https://github.com/vyasr)
+- JNI Avoid NPE for reading host binary data ([#11865](https://github.com/rapidsai/cudf/pull/11865)) [@revans2](https://github.com/revans2)
+- Fix decimal benchmark input data generation ([#11863](https://github.com/rapidsai/cudf/pull/11863)) [@karthikeyann](https://github.com/karthikeyann)
+- Fix pre-commit copyright check ([#11860](https://github.com/rapidsai/cudf/pull/11860)) [@galipremsagar](https://github.com/galipremsagar)
+- Fix Parquet support for seconds and milliseconds duration types ([#11854](https://github.com/rapidsai/cudf/pull/11854)) [@vuule](https://github.com/vuule)
+- Ensure better compiler cache results between cudf cal-ver branches ([#11835](https://github.com/rapidsai/cudf/pull/11835)) [@robertmaynard](https://github.com/robertmaynard)
+- Fix make_column_from_scalar for all-null strings column ([#11807](https://github.com/rapidsai/cudf/pull/11807)) [@davidwendt](https://github.com/davidwendt)
+- Tell jitify_preprocess where to search for libnvrtc ([#11787](https://github.com/rapidsai/cudf/pull/11787)) [@robertmaynard](https://github.com/robertmaynard)
+- add V2 page header support to parquet reader ([#11778](https://github.com/rapidsai/cudf/pull/11778)) [@etseidl](https://github.com/etseidl)
+- Parquet reader: bug fix for a num_rows/skip_rows corner case, w/optimization for nested preprocessing ([#11752](https://github.com/rapidsai/cudf/pull/11752)) [@nvdbaranec](https://github.com/nvdbaranec)
+- Determine if Arrow has S3 support at runtime in unit test. ([#11560](https://github.com/rapidsai/cudf/pull/11560)) [@bdice](https://github.com/bdice)
+
+## 📖 Documentation
+
+- Use rapidsai CODE_OF_CONDUCT.md ([#12166](https://github.com/rapidsai/cudf/pull/12166)) [@bdice](https://github.com/bdice)
+- Add symlinks to notebooks. ([#12128](https://github.com/rapidsai/cudf/pull/12128)) [@bdice](https://github.com/bdice)
+- Add `truncate` API to python doc pages ([#12109](https://github.com/rapidsai/cudf/pull/12109)) [@galipremsagar](https://github.com/galipremsagar)
+- Update Numba docs links. 
([#12107](https://github.com/rapidsai/cudf/pull/12107)) [@bdice](https://github.com/bdice)
+- Remove "Multi-GPU with Dask-cuDF" notebook. ([#12095](https://github.com/rapidsai/cudf/pull/12095)) [@bdice](https://github.com/bdice)
+- Fix link to c++ developer guide from `CONTRIBUTING.md` ([#12084](https://github.com/rapidsai/cudf/pull/12084)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Add pivot_table and crosstab to docs. ([#12014](https://github.com/rapidsai/cudf/pull/12014)) [@bdice](https://github.com/bdice)
+- Fix doxygen text for cudf::dictionary::encode ([#11991](https://github.com/rapidsai/cudf/pull/11991)) [@davidwendt](https://github.com/davidwendt)
+- Replace default_stream_value with get_default_stream in docs. ([#11985](https://github.com/rapidsai/cudf/pull/11985)) [@vyasr](https://github.com/vyasr)
+- Add dtype docs pages and docstrings for `cudf` specific dtypes ([#11974](https://github.com/rapidsai/cudf/pull/11974)) [@galipremsagar](https://github.com/galipremsagar)
+- Update Unit Testing in libcudf guidelines to code tests outside the cudf::test namespace ([#11959](https://github.com/rapidsai/cudf/pull/11959)) [@davidwendt](https://github.com/davidwendt)
+- Rename libcudf++ to libcudf. ([#11953](https://github.com/rapidsai/cudf/pull/11953)) [@bdice](https://github.com/bdice)
+- Fix documentation referring to removed as_gpu_matrix method. ([#11937](https://github.com/rapidsai/cudf/pull/11937)) [@bdice](https://github.com/bdice)
+- Remove "experimental" warning for struct columns in ORC reader and writer ([#11880](https://github.com/rapidsai/cudf/pull/11880)) [@vuule](https://github.com/vuule)
+- Initial draft of policies and guidelines for libcudf usage. ([#11853](https://github.com/rapidsai/cudf/pull/11853)) [@vyasr](https://github.com/vyasr)
+- Add clear indication of non-GPU accelerated parameters in read_json docstring ([#11825](https://github.com/rapidsai/cudf/pull/11825)) [@GregoryKimball](https://github.com/GregoryKimball)
+- Add developer docs for writing tests ([#11199](https://github.com/rapidsai/cudf/pull/11199)) [@vyasr](https://github.com/vyasr)
+
+## 🚀 New Features
+
+- Adds an EventHandler to Java MemoryBuffer to be invoked on close ([#12125](https://github.com/rapidsai/cudf/pull/12125)) [@abellina](https://github.com/abellina)
+- Support `+` in `strings_udf` ([#12117](https://github.com/rapidsai/cudf/pull/12117)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Support `upper` and `lower` in `strings_udf` ([#12099](https://github.com/rapidsai/cudf/pull/12099)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Add wheel builds ([#12096](https://github.com/rapidsai/cudf/pull/12096)) [@vyasr](https://github.com/vyasr)
+- Allow setting malloc heap size in string udfs ([#12094](https://github.com/rapidsai/cudf/pull/12094)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Support `strip`, `lstrip`, and `rstrip` in `strings_udf` ([#12091](https://github.com/rapidsai/cudf/pull/12091)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Mark nvcomp zstd compression stable ([#12059](https://github.com/rapidsai/cudf/pull/12059)) [@jbrennan333](https://github.com/jbrennan333)
+- Add debug-only onAllocated/onDeallocated to RmmEventHandler ([#12054](https://github.com/rapidsai/cudf/pull/12054)) [@abellina](https://github.com/abellina)
+- Enable building against the libarrow contained in pyarrow 
([#12034](https://github.com/rapidsai/cudf/pull/12034)) [@vyasr](https://github.com/vyasr)
+- Add strings `like` jni and native method ([#12032](https://github.com/rapidsai/cudf/pull/12032)) [@cindyyuanjiang](https://github.com/cindyyuanjiang)
+- Cleanup common parsing code in JSON, CSV reader ([#12022](https://github.com/rapidsai/cudf/pull/12022)) [@karthikeyann](https://github.com/karthikeyann)
+- byte_range support for JSON Lines format ([#12017](https://github.com/rapidsai/cudf/pull/12017)) [@karthikeyann](https://github.com/karthikeyann)
+- Minor cleanup of root CMakeLists.txt for better organization ([#11988](https://github.com/rapidsai/cudf/pull/11988)) [@robertmaynard](https://github.com/robertmaynard)
+- Add inplace arithmetic operators to `MaskedType` ([#11987](https://github.com/rapidsai/cudf/pull/11987)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Implement JNI for chunked Parquet reader ([#11961](https://github.com/rapidsai/cudf/pull/11961)) [@ttnghia](https://github.com/ttnghia)
+- Add method argument to DataFrame.quantile ([#11957](https://github.com/rapidsai/cudf/pull/11957)) [@rjzamora](https://github.com/rjzamora)
+- Add gpu memory watermark apis to JNI ([#11950](https://github.com/rapidsai/cudf/pull/11950)) [@abellina](https://github.com/abellina)
+- Adds retryCount to RmmEventHandler.onAllocFailure ([#11940](https://github.com/rapidsai/cudf/pull/11940)) [@abellina](https://github.com/abellina)
+- Enable returning string data from UDFs used through `apply` ([#11933](https://github.com/rapidsai/cudf/pull/11933)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- Switch over to rapids-cmake patches for thrust ([#11921](https://github.com/rapidsai/cudf/pull/11921)) [@robertmaynard](https://github.com/robertmaynard)
+- Add strings udf C++ classes and functions for phase II ([#11912](https://github.com/rapidsai/cudf/pull/11912)) [@davidwendt](https://github.com/davidwendt)
+- Trim quotes for non-string values in nested json parsing ([#11898](https://github.com/rapidsai/cudf/pull/11898)) [@karthikeyann](https://github.com/karthikeyann)
+- Enable CEC for `strings_udf` ([#11884](https://github.com/rapidsai/cudf/pull/11884)) [@brandon-b-miller](https://github.com/brandon-b-miller)
+- ArrowIPCTableWriter writes an empty batch in the case of an empty table. 
([#11883](https://github.com/rapidsai/cudf/pull/11883)) [@firestarman](https://github.com/firestarman)
+- Implement chunked Parquet reader ([#11867](https://github.com/rapidsai/cudf/pull/11867)) [@ttnghia](https://github.com/ttnghia)
+- Add `read_orc_metadata` to libcudf ([#11815](https://github.com/rapidsai/cudf/pull/11815)) [@vuule](https://github.com/vuule)
+- Support nested types as groupby keys in libcudf ([#11792](https://github.com/rapidsai/cudf/pull/11792)) [@PointKernel](https://github.com/PointKernel)
+- Adding feature Truncate to DataFrame and Series ([#11435](https://github.com/rapidsai/cudf/pull/11435)) [@VamsiTallam95](https://github.com/VamsiTallam95)
+
+## 🛠️ Improvements
+
+- Reduce number of tests marked `spilling` ([#12197](https://github.com/rapidsai/cudf/pull/12197)) [@madsbk](https://github.com/madsbk)
+- Pin `dask` and `distributed` for release ([#12165](https://github.com/rapidsai/cudf/pull/12165)) [@galipremsagar](https://github.com/galipremsagar)
+- Don't rely on GNU find in headers_test.sh ([#12164](https://github.com/rapidsai/cudf/pull/12164)) [@wence-](https://github.com/wence-)
+- Update cp.clip call ([#12148](https://github.com/rapidsai/cudf/pull/12148)) [@quasiben](https://github.com/quasiben)
+- Enable automatic column projection in groupby().agg ([#12124](https://github.com/rapidsai/cudf/pull/12124)) [@rjzamora](https://github.com/rjzamora)
+- Refactor `purge_nonempty_nulls` ([#12111](https://github.com/rapidsai/cudf/pull/12111)) [@ttnghia](https://github.com/ttnghia)
+- Create an `int8` column in `read_csv` when all elements are missing ([#12110](https://github.com/rapidsai/cudf/pull/12110)) [@vuule](https://github.com/vuule)
+- Spilling to host memory ([#12106](https://github.com/rapidsai/cudf/pull/12106)) [@madsbk](https://github.com/madsbk)
+- First pass of `pd.read_orc` changes in tests ([#12103](https://github.com/rapidsai/cudf/pull/12103)) [@galipremsagar](https://github.com/galipremsagar)
+- Expose engine argument in dask_cudf.read_json ([#12101](https://github.com/rapidsai/cudf/pull/12101)) [@rjzamora](https://github.com/rjzamora)
+- Remove CUDA 10 compatibility code. 
([#12088](https://github.com/rapidsai/cudf/pull/12088)) [@bdice](https://github.com/bdice)
+- Move and update `dask` nightly install in CI ([#12082](https://github.com/rapidsai/cudf/pull/12082)) [@galipremsagar](https://github.com/galipremsagar)
+- Throw an error when libcudf is built without cuFile and `LIBCUDF_CUFILE_POLICY` is set to `"ALWAYS"` ([#12080](https://github.com/rapidsai/cudf/pull/12080)) [@vuule](https://github.com/vuule)
+- Remove macros that inspect the contents of exceptions ([#12076](https://github.com/rapidsai/cudf/pull/12076)) [@vyasr](https://github.com/vyasr)
+- Fix ingest_raw_data performance issue in Nested JSON reader due to RVO ([#12070](https://github.com/rapidsai/cudf/pull/12070)) [@karthikeyann](https://github.com/karthikeyann)
+- Remove overflow error during decimal binops ([#12063](https://github.com/rapidsai/cudf/pull/12063)) [@galipremsagar](https://github.com/galipremsagar)
+- Change cudf::detail::tdigest to cudf::tdigest::detail ([#12050](https://github.com/rapidsai/cudf/pull/12050)) [@davidwendt](https://github.com/davidwendt)
+- Fix quantile gtests coded in namespace cudf::test ([#12049](https://github.com/rapidsai/cudf/pull/12049)) [@davidwendt](https://github.com/davidwendt)
+- Add support for `DataFrame.from_dict`\`to_dict` and `Series.to_dict` ([#12048](https://github.com/rapidsai/cudf/pull/12048)) [@galipremsagar](https://github.com/galipremsagar)
+- Refactor Parquet reader ([#12046](https://github.com/rapidsai/cudf/pull/12046)) [@ttnghia](https://github.com/ttnghia)
+- Forward merge 22.10 into 22.12 ([#12045](https://github.com/rapidsai/cudf/pull/12045)) [@vyasr](https://github.com/vyasr)
+- Standardize newlines at ends of files. ([#12042](https://github.com/rapidsai/cudf/pull/12042)) [@bdice](https://github.com/bdice)
+- Trim trailing whitespace from all files. ([#12041](https://github.com/rapidsai/cudf/pull/12041)) [@bdice](https://github.com/bdice)
+- Use nosync policy in gather and scatter implementations. ([#12038](https://github.com/rapidsai/cudf/pull/12038)) [@bdice](https://github.com/bdice)
+- Remove smart quotes from all docstrings. ([#12035](https://github.com/rapidsai/cudf/pull/12035)) [@bdice](https://github.com/bdice)
+- Update cuda-python dependency to 11.7.1 ([#12030](https://github.com/rapidsai/cudf/pull/12030)) [@galipremsagar](https://github.com/galipremsagar)
+- Add cython-lint to pre-commit checks. ([#12020](https://github.com/rapidsai/cudf/pull/12020)) [@bdice](https://github.com/bdice)
+- Use pragma once ([#12019](https://github.com/rapidsai/cudf/pull/12019)) [@bdice](https://github.com/bdice)
+- New GHA to add issues/prs to project board ([#12016](https://github.com/rapidsai/cudf/pull/12016)) [@jarmak-nv](https://github.com/jarmak-nv)
+- Add DataFrame.pivot_table. 
([#12015](https://github.com/rapidsai/cudf/pull/12015)) [@bdice](https://github.com/bdice) +- Rollback of `DeviceBufferLike` ([#12009](https://github.com/rapidsai/cudf/pull/12009)) [@madsbk](https://github.com/madsbk) +- Remove default parameters for nvtext::detail functions ([#12007](https://github.com/rapidsai/cudf/pull/12007)) [@davidwendt](https://github.com/davidwendt) +- Remove default parameters for cudf::dictionary::detail functions ([#12006](https://github.com/rapidsai/cudf/pull/12006)) [@davidwendt](https://github.com/davidwendt) +- Remove unused `managed_allocator` ([#12005](https://github.com/rapidsai/cudf/pull/12005)) [@vyasr](https://github.com/vyasr) +- Remove default parameters for cudf::strings::detail functions ([#12003](https://github.com/rapidsai/cudf/pull/12003)) [@davidwendt](https://github.com/davidwendt) +- Remove unnecessary code from dask-cudf _Frame ([#12001](https://github.com/rapidsai/cudf/pull/12001)) [@rjzamora](https://github.com/rjzamora) +- Ignore python docs build artifacts ([#12000](https://github.com/rapidsai/cudf/pull/12000)) [@galipremsagar](https://github.com/galipremsagar) +- Use rapids-cmake for google benchmark. ([#11997](https://github.com/rapidsai/cudf/pull/11997)) [@vyasr](https://github.com/vyasr) +- Leverage rapids_cython for more automated RPATH handling ([#11996](https://github.com/rapidsai/cudf/pull/11996)) [@vyasr](https://github.com/vyasr) +- Remove stale labeler ([#11995](https://github.com/rapidsai/cudf/pull/11995)) [@raydouglass](https://github.com/raydouglass) +- Move protobuf compilation to CMake ([#11986](https://github.com/rapidsai/cudf/pull/11986)) [@vyasr](https://github.com/vyasr) +- Replace most of preprocessor usage in nvcomp adapter with `constexpr` ([#11980](https://github.com/rapidsai/cudf/pull/11980)) [@vuule](https://github.com/vuule) +- Add missing noexcepts to column_in_metadata methods ([#11973](https://github.com/rapidsai/cudf/pull/11973)) [@vyasr](https://github.com/vyasr) +- Pass column names to `write_csv` instead of `table_metadata` pointer ([#11972](https://github.com/rapidsai/cudf/pull/11972)) [@vuule](https://github.com/vuule) +- Accelerate libcudf segmented sort with CUB segmented sort ([#11969](https://github.com/rapidsai/cudf/pull/11969)) [@davidwendt](https://github.com/davidwendt) +- Feature/remove default streams ([#11967](https://github.com/rapidsai/cudf/pull/11967)) [@vyasr](https://github.com/vyasr) +- Add pool memory resource to libcudf basic example ([#11966](https://github.com/rapidsai/cudf/pull/11966)) [@davidwendt](https://github.com/davidwendt) +- Fix some libcudf calls to cudf::detail::gather ([#11963](https://github.com/rapidsai/cudf/pull/11963)) [@davidwendt](https://github.com/davidwendt) +- Accept const refs instead of const unique_ptr refs in reduce and scan APIs. ([#11960](https://github.com/rapidsai/cudf/pull/11960)) [@vyasr](https://github.com/vyasr) +- Add deprecation warning for set_allocator. 
([#11958](https://github.com/rapidsai/cudf/pull/11958)) [@vyasr](https://github.com/vyasr) +- Fix lists and structs gtests coded in namespace cudf::test ([#11956](https://github.com/rapidsai/cudf/pull/11956)) [@davidwendt](https://github.com/davidwendt) +- Add full page indexes to Parquet writer benchmarks ([#11955](https://github.com/rapidsai/cudf/pull/11955)) [@etseidl](https://github.com/etseidl) +- Use gather-based strings factory in cudf::strings::strip ([#11954](https://github.com/rapidsai/cudf/pull/11954)) [@davidwendt](https://github.com/davidwendt) +- Default to equal NaNs in make_merge_sets_aggregation. ([#11952](https://github.com/rapidsai/cudf/pull/11952)) [@bdice](https://github.com/bdice) +- Add `strip_delimiters` option to `read_text` ([#11946](https://github.com/rapidsai/cudf/pull/11946)) [@upsj](https://github.com/upsj) +- Refactor multibyte_split `output_builder` ([#11945](https://github.com/rapidsai/cudf/pull/11945)) [@upsj](https://github.com/upsj) +- Remove validation that requires introspection ([#11938](https://github.com/rapidsai/cudf/pull/11938)) [@vyasr](https://github.com/vyasr) +- Add `.str.find_multiple` API ([#11928](https://github.com/rapidsai/cudf/pull/11928)) [@galipremsagar](https://github.com/galipremsagar) +- Add regex_program class for use with all regex APIs ([#11927](https://github.com/rapidsai/cudf/pull/11927)) [@davidwendt](https://github.com/davidwendt) +- Enable backend dispatching for Dask-DataFrame creation ([#11920](https://github.com/rapidsai/cudf/pull/11920)) [@rjzamora](https://github.com/rjzamora) +- Performance improvement in JSON Tree traversal ([#11919](https://github.com/rapidsai/cudf/pull/11919)) [@karthikeyann](https://github.com/karthikeyann) +- Fix some gtests incorrectly coded in namespace cudf::test (part I) ([#11917](https://github.com/rapidsai/cudf/pull/11917)) [@davidwendt](https://github.com/davidwendt) +- Refactor pad/zfill functions for reuse with strings udf ([#11914](https://github.com/rapidsai/cudf/pull/11914)) [@davidwendt](https://github.com/davidwendt) +- Add `nanosecond` & `microsecond` to `DatetimeProperties` ([#11911](https://github.com/rapidsai/cudf/pull/11911)) [@galipremsagar](https://github.com/galipremsagar) +- Pin mimesis version in setup.py. ([#11906](https://github.com/rapidsai/cudf/pull/11906)) [@bdice](https://github.com/bdice) +- Error on `ListColumn` or any new unsupported column in `cudf.Index` ([#11902](https://github.com/rapidsai/cudf/pull/11902)) [@galipremsagar](https://github.com/galipremsagar) +- Add thrust output iterator fix (1805) to thrust.patch ([#11900](https://github.com/rapidsai/cudf/pull/11900)) [@davidwendt](https://github.com/davidwendt) +- Relax `codecov` threshold diff ([#11899](https://github.com/rapidsai/cudf/pull/11899)) [@galipremsagar](https://github.com/galipremsagar) +- Use public APIs in STREAM_COMPACTION_NVBENCH ([#11892](https://github.com/rapidsai/cudf/pull/11892)) [@GregoryKimball](https://github.com/GregoryKimball) +- Add coverage for string UDF tests. 
([#11891](https://github.com/rapidsai/cudf/pull/11891)) [@vyasr](https://github.com/vyasr) +- Provide `data_chunk_source` wrapper for `datasource` ([#11886](https://github.com/rapidsai/cudf/pull/11886)) [@upsj](https://github.com/upsj) +- Handle `multibyte_split` byte_range out-of-bounds offsets on host ([#11885](https://github.com/rapidsai/cudf/pull/11885)) [@upsj](https://github.com/upsj) +- Add tests ensuring that cudf's default stream is always used ([#11875](https://github.com/rapidsai/cudf/pull/11875)) [@vyasr](https://github.com/vyasr) +- Change expect_strings_empty into expect_column_empty libcudf test utility ([#11873](https://github.com/rapidsai/cudf/pull/11873)) [@davidwendt](https://github.com/davidwendt) +- Add ngroup ([#11871](https://github.com/rapidsai/cudf/pull/11871)) [@shwina](https://github.com/shwina) +- Reduce memory usage in nested JSON parser - tree generation ([#11864](https://github.com/rapidsai/cudf/pull/11864)) [@karthikeyann](https://github.com/karthikeyann) +- Unpin `dask` and `distributed` for development ([#11859](https://github.com/rapidsai/cudf/pull/11859)) [@galipremsagar](https://github.com/galipremsagar) +- Remove unused includes for table/row_operators ([#11857](https://github.com/rapidsai/cudf/pull/11857)) [@GregoryKimball](https://github.com/GregoryKimball) +- Use conda-forge's `pyorc` ([#11855](https://github.com/rapidsai/cudf/pull/11855)) [@jakirkham](https://github.com/jakirkham) +- Add libcudf strings examples ([#11849](https://github.com/rapidsai/cudf/pull/11849)) [@davidwendt](https://github.com/davidwendt) +- Remove `cudf_io` namespace alias ([#11827](https://github.com/rapidsai/cudf/pull/11827)) [@vuule](https://github.com/vuule) +- Test/remove thrust vector usage ([#11813](https://github.com/rapidsai/cudf/pull/11813)) [@vyasr](https://github.com/vyasr) +- Add BGZIP reader to python `read_text` ([#11802](https://github.com/rapidsai/cudf/pull/11802)) [@upsj](https://github.com/upsj) +- Merge branch-22.10 into branch-22.12 ([#11801](https://github.com/rapidsai/cudf/pull/11801)) [@davidwendt](https://github.com/davidwendt) +- Fix compile warning from CUDF_FUNC_RANGE in a member function ([#11798](https://github.com/rapidsai/cudf/pull/11798)) [@davidwendt](https://github.com/davidwendt) +- Update cudf JNI version to 22.12.0-SNAPSHOT ([#11764](https://github.com/rapidsai/cudf/pull/11764)) [@pxLi](https://github.com/pxLi) +- Update flake8 to 5.0.4 and use flake8-force to check Cython. ([#11736](https://github.com/rapidsai/cudf/pull/11736)) [@bdice](https://github.com/bdice) +- Add BGZIP multibyte_split benchmark ([#11723](https://github.com/rapidsai/cudf/pull/11723)) [@upsj](https://github.com/upsj) +- Bifurcate Dependency Lists ([#11674](https://github.com/rapidsai/cudf/pull/11674)) [@bdice](https://github.com/bdice) +- Default to equal NaNs in make_collect_set_aggregation. 
([#11621](https://github.com/rapidsai/cudf/pull/11621)) [@bdice](https://github.com/bdice)
+- Conform "bench_isin" to match generator column names ([#11549](https://github.com/rapidsai/cudf/pull/11549)) [@GregoryKimball](https://github.com/GregoryKimball)
+- Removing int8 column option from parquet byte_array writing ([#11539](https://github.com/rapidsai/cudf/pull/11539)) [@hyperbolic2346](https://github.com/hyperbolic2346)
+- Add checks for HLG layers in dask-cudf groupby tests ([#10853](https://github.com/rapidsai/cudf/pull/10853)) [@charlesbluca](https://github.com/charlesbluca)
+- part1: Simplify BaseIndex to an abstract class ([#10389](https://github.com/rapidsai/cudf/pull/10389)) [@skirui-source](https://github.com/skirui-source)
+- Make all `nvcc` warnings into errors ([#8916](https://github.com/rapidsai/cudf/pull/8916)) [@trxcllnt](https://github.com/trxcllnt)
+
 # cuDF 22.10.00 (12 Oct 2022)

 ## 🚨 Breaking Changes
diff --git a/CODE_OF_CONDUCT.md b/CODE_OF_CONDUCT.md
deleted file mode 100644
index 3029fbb41af..00000000000
--- a/CODE_OF_CONDUCT.md
+++ /dev/null
@@ -1 +0,0 @@
-This project has adopted the [Contributor Covenant Code of Conduct](https://docs.rapids.ai/resources/conduct/).
diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md
index 6eb621abcc3..608bd42d86c 100644
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -99,13 +99,13 @@ cd $CUDF_HOME
 **Note:** Using a conda environment is the easiest way to satisfy the library's dependencies.
 Instructions for a minimal build environment without conda are included below.

-- Create the conda development environment `cudf_dev`:
+- Create the conda development environment:

 ```bash
 # create the conda environment (assuming in base `cudf` directory)
 # note: RAPIDS currently doesn't support `channel_priority: strict`;
 # use `channel_priority: flexible` instead
-conda env create --name cudf_dev --file conda/environments/cudf_dev_cuda11.5.yml
+conda env create --name cudf_dev --file conda/environments/all_cuda-115_arch-x86_64.yaml
 # activate the environment
 conda activate cudf_dev
 ```
@@ -114,9 +114,6 @@ conda activate cudf_dev
 development environment may also need to be updated if dependency versions or
 pinnings are changed.

-- For other CUDA versions, check the corresponding `cudf_dev_cuda*.yml` file in
-  `conda/environments/`.
-
 #### Building without a conda environment

 - libcudf has the following minimal dependencies (in addition to those listed in the [General
@@ -382,7 +379,7 @@ You can skip these checks with `git commit --no-verify` or with the short versio

 ## Developer Guidelines

-The [C++ Developer Guide](cpp/docs/DEVELOPER_GUIDE.md) includes details on contributing to libcudf C++ code.
+The [C++ Developer Guide](cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md) includes details on contributing to libcudf C++ code.

 The [Python Developer Guide](https://docs.rapids.ai/api/cudf/stable/developer_guide/index.html) includes details on contributing to cuDF Python code.
diff --git a/README.md b/README.md
index 641ce1316b3..a013d3a9ea4 100644
--- a/README.md
+++ b/README.md
@@ -50,7 +50,7 @@ For additional examples, browse our complete [API documentation](https://docs.ra

 ## Quick Start

-Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapidsai/), choosing a tag based on the NVIDIA CUDA version you’re running. This provides a ready to run Docker container with example notebooks and data, showcasing how you can utilize cuDF. 
+Please see the [Demo Docker Repository](https://hub.docker.com/r/rapidsai/rapidsai/), choosing a tag based on the NVIDIA CUDA version you're running. This provides a ready to run Docker container with example notebooks and data, showcasing how you can utilize cuDF. ## Installation diff --git a/build.sh b/build.sh index bda3d83798a..e62da9791da 100755 --- a/build.sh +++ b/build.sh @@ -64,7 +64,7 @@ BUILD_BENCHMARKS=OFF BUILD_ALL_GPU_ARCH=0 BUILD_NVTX=ON BUILD_TESTS=OFF -BUILD_DISABLE_DEPRECATION_WARNING=ON +BUILD_DISABLE_DEPRECATION_WARNINGS=ON BUILD_PER_THREAD_DEFAULT_STREAM=OFF BUILD_REPORT_METRICS=OFF BUILD_REPORT_INCL_CACHE_STATS=OFF @@ -216,7 +216,7 @@ if hasArg --opensource_nvcomp; then USE_PROPRIETARY_NVCOMP="OFF" fi if hasArg --show_depr_warn; then - BUILD_DISABLE_DEPRECATION_WARNING=OFF + BUILD_DISABLE_DEPRECATION_WARNINGS=OFF fi if hasArg --ptds; then BUILD_PER_THREAD_DEFAULT_STREAM=ON @@ -285,7 +285,7 @@ if buildAll || hasArg libcudf; then -DCUDF_USE_PROPRIETARY_NVCOMP=${USE_PROPRIETARY_NVCOMP} \ -DBUILD_TESTS=${BUILD_TESTS} \ -DBUILD_BENCHMARKS=${BUILD_BENCHMARKS} \ - -DDISABLE_DEPRECATION_WARNING=${BUILD_DISABLE_DEPRECATION_WARNING} \ + -DDISABLE_DEPRECATION_WARNINGS=${BUILD_DISABLE_DEPRECATION_WARNINGS} \ -DCUDF_USE_PER_THREAD_DEFAULT_STREAM=${BUILD_PER_THREAD_DEFAULT_STREAM} \ -DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ ${EXTRA_CMAKE_ARGS} diff --git a/ci/benchmark/build.sh b/ci/benchmark/build.sh index ffa48797fe3..e186946a3d0 100755 --- a/ci/benchmark/build.sh +++ b/ci/benchmark/build.sh @@ -40,7 +40,7 @@ export LIBCUDF_KERNEL_CACHE_PATH="$HOME/.jitify-cache" export INSTALL_DASK_MAIN=0 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2022.9.2" +export DASK_STABLE_VERSION="2022.11.1" function remove_libcudf_kernel_cache_dir { EXITCODE=$? @@ -82,8 +82,8 @@ conda install "rmm=$MINOR_VERSION.*" "cudatoolkit=$CUDA_REL" \ # Install the conda-forge or nightly version of dask and distributed if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then - gpuci_logger "gpuci_mamba_retry update dask" - gpuci_mamba_retry update dask + gpuci_logger "gpuci_mamba_retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'" + gpuci_mamba_retry install -c dask/label/dev "dask/label/dev::dask" "dask/label/dev::distributed" else gpuci_logger "gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall diff --git a/ci/checks/copyright.py b/ci/checks/copyright.py index 61e30d7922e..83f43183f71 100644 --- a/ci/checks/copyright.py +++ b/ci/checks/copyright.py @@ -68,20 +68,40 @@ def modifiedFiles(): we can read only the staged changes. """ repo = git.Repo() - # TARGET_BRANCH is defined in CI + # Use the environment variable TARGET_BRANCH (defined in CI) if possible target_branch = os.environ.get("TARGET_BRANCH") if target_branch is None: # Fall back to the closest branch if not on CI target_branch = repo.git.describe( all=True, tags=True, match="branch-*", abbrev=0 ).lstrip("heads/") - try: - # Use the tracking branch of the local reference if it exists + + upstream_target_branch = None + if target_branch in repo.heads: + # Use the tracking branch of the local reference if it exists. This + # returns None if no tracking branch is set. 
upstream_target_branch = repo.heads[target_branch].tracking_branch() - except IndexError: - # Fall back to the remote reference (this happens on CI because the - # only local branch reference is current-pr-branch) - upstream_target_branch = repo.remote().refs[target_branch] + if upstream_target_branch is None: + # Fall back to the remote with the newest target_branch. This code + # path is used on CI because the only local branch reference is + # current-pr-branch, and thus target_branch is not in repo.heads. + # This also happens if no tracking branch is defined for the local + # target_branch. We use the remote with the latest commit if + # multiple remotes are defined. + candidate_branches = [ + remote.refs[target_branch] for remote in repo.remotes + if target_branch in remote.refs + ] + if len(candidate_branches) > 0: + upstream_target_branch = sorted( + candidate_branches, + key=lambda branch: branch.commit.committed_datetime, + )[-1] + else: + # If no remotes are defined, try to use the local version of the + # target_branch. If this fails, the repo configuration must be very + # strange and we can fix this script on a case-by-case basis. + upstream_target_branch = repo.heads[target_branch] merge_base = repo.merge_base("HEAD", upstream_target_branch.commit)[0] diff = merge_base.diff() changed_files = {f for f in diff if f.b_path is not None} diff --git a/ci/checks/headers_test.sh b/ci/checks/headers_test.sh index 502bdca0fa7..b859009a8c5 100755 --- a/ci/checks/headers_test.sh +++ b/ci/checks/headers_test.sh @@ -10,7 +10,7 @@ DIRNAMES="cudf cudf_test" # existence tests for lib${LIBNAME} for DIRNAME in ${DIRNAMES[@]}; do - HEADERS=`cd cpp && find include/${DIRNAME}/ -type f \( -iname "*.h" -o -iname "*.hpp" \) -printf " - test -f \\\$PREFIX/%p\n" | sort` + HEADERS=`cd cpp && find include/${DIRNAME} -type f \( -iname "*.h" -o -iname "*.hpp" \) -print | sed 's|^| - test -f $PREFIX/|' | sort` META_TESTS=`grep -E "test -f .*/include/${DIRNAME}/.*\.h(pp)?" conda/recipes/lib${LIBNAME}/meta.yaml | sort` HEADER_DIFF=`diff <(echo "$HEADERS") <(echo "$META_TESTS")` LIB_RETVAL=$? diff --git a/ci/checks/style.sh b/ci/checks/style.sh index 680321378c0..54cf3928cf4 100755 --- a/ci/checks/style.sh +++ b/ci/checks/style.sh @@ -14,7 +14,7 @@ LANG=C.UTF-8 . 
/opt/conda/etc/profile.d/conda.sh conda activate rapids -FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.10/cmake-format-rapids-cmake.json +FORMAT_FILE_URL=https://raw.githubusercontent.com/rapidsai/rapids-cmake/branch-22.12/cmake-format-rapids-cmake.json export RAPIDS_CMAKE_FORMAT_FILE=/tmp/rapids_cmake_ci/cmake-formats-rapids-cmake.json mkdir -p $(dirname ${RAPIDS_CMAKE_FORMAT_FILE}) wget -O ${RAPIDS_CMAKE_FORMAT_FILE} ${FORMAT_FILE_URL} diff --git a/ci/cpu/build.sh b/ci/cpu/build.sh index 560de6db187..2e12308169f 100755 --- a/ci/cpu/build.sh +++ b/ci/cpu/build.sh @@ -130,7 +130,7 @@ if [ "$BUILD_CUDF" == '1' ]; then gpuci_logger "Build conda pkg for custreamz" gpuci_conda_retry mambabuild --croot ${CONDA_BLD_DIR} conda/recipes/custreamz --python=$PYTHON $CONDA_BUILD_ARGS $CONDA_CHANNEL - + gpuci_logger "Build conda pkg for strings_udf" gpuci_conda_retry mambabuild --croot ${CONDA_BLD_DIR} conda/recipes/strings_udf --python=$PYTHON $CONDA_BUILD_ARGS $CONDA_CHANNEL diff --git a/ci/gpu/build.sh b/ci/gpu/build.sh index b6a27c31614..3a65130f922 100755 --- a/ci/gpu/build.sh +++ b/ci/gpu/build.sh @@ -35,10 +35,10 @@ unset GIT_DESCRIBE_TAG export INSTALL_DASK_MAIN=0 # Dask version to install when `INSTALL_DASK_MAIN=0` -export DASK_STABLE_VERSION="2022.9.2" +export DASK_STABLE_VERSION="2022.11.1" # ucx-py version -export UCX_PY_VERSION='0.28.*' +export UCX_PY_VERSION='0.29.*' ################################################################################ # TRAP - Setup trap for removing jitify cache @@ -96,12 +96,12 @@ function install_dask { gpuci_logger "Install the conda-forge or nightly version of dask and distributed" set -x if [[ "${INSTALL_DASK_MAIN}" == 1 ]]; then - gpuci_logger "gpuci_mamba_retry update dask" - gpuci_mamba_retry update dask + gpuci_logger "gpuci_mamba_retry install -c dask/label/dev 'dask/label/dev::dask' 'dask/label/dev::distributed'" + gpuci_mamba_retry install -c dask/label/dev "dask/label/dev::dask" "dask/label/dev::distributed" conda list else gpuci_logger "gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall" - gpuci_mamba_retry install conda-forge::dask=={$DASK_STABLE_VERSION} conda-forge::distributed=={$DASK_STABLE_VERSION} conda-forge::dask-core=={$DASK_STABLE_VERSION} --force-reinstall + gpuci_mamba_retry install conda-forge::dask==$DASK_STABLE_VERSION conda-forge::distributed==$DASK_STABLE_VERSION conda-forge::dask-core==$DASK_STABLE_VERSION --force-reinstall fi # Install the main version of streamz gpuci_logger "Install the main version of streamz" @@ -111,6 +111,8 @@ function install_dask { set +x } +install_dask + if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then gpuci_logger "Install dependencies" @@ -126,8 +128,6 @@ if [[ -z "$PROJECT_FLASH" || "$PROJECT_FLASH" == "0" ]]; then # gpuci_conda_retry remove --force rapids-build-env rapids-notebook-env # gpuci_mamba_retry install -y "your-pkg=1.0.0" - install_dask - ################################################################################ # BUILD - Build libcudf, cuDF, libcudf_kafka, dask_cudf, and strings_udf from source ################################################################################ @@ -197,16 +197,31 @@ else # copied by CI from the upstream 11.5 jobs into $CONDA_ARTIFACT_PATH gpuci_logger "Installing cudf, dask-cudf, cudf_kafka, and custreamz" gpuci_mamba_retry install cudf dask-cudf cudf_kafka custreamz -c "${CONDA_BLD_DIR}" 
-c "${CONDA_ARTIFACT_PATH}" - + gpuci_logger "Check current conda environment" conda list --show-channel-urls gpuci_logger "GoogleTests" + + # Set up library for finding incorrect default stream usage. + cd "$WORKSPACE/cpp/tests/utilities/identify_stream_usage/" + mkdir build && cd build && cmake .. -GNinja && ninja && ninja test + STREAM_IDENTIFY_LIB="$WORKSPACE/cpp/tests/utilities/identify_stream_usage/build/libidentify_stream_usage.so" + # Run libcudf and libcudf_kafka gtests from libcudf-tests package for gt in "$CONDA_PREFIX/bin/gtests/libcudf"*/* ; do test_name=$(basename ${gt}) + echo "Running GoogleTest $test_name" - ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" + if [[ ${test_name} == "SPAN_TEST" ]]; then + # This one test is specifically designed to test using a thrust device + # vector, so we expect and allow it to include default stream usage. + gtest_filter="SpanTest.CanConstructFromDeviceContainers" + GTEST_CUDF_STREAM_MODE="custom" LD_PRELOAD=${STREAM_IDENTIFY_LIB} ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" --gtest_filter="-${gtest_filter}" + ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" --gtest_filter="${gtest_filter}" + else + GTEST_CUDF_STREAM_MODE="custom" LD_PRELOAD=${STREAM_IDENTIFY_LIB} ${gt} --gtest_output=xml:"$WORKSPACE/test-results/" + fi done # Test libcudf (csv, orc, and parquet) with `LIBCUDF_CUFILE_POLICY=KVIKIO` @@ -267,6 +282,10 @@ conda list gpuci_logger "Python py.test for cuDF" py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf.xml" -v --cov-config="$WORKSPACE/python/cudf/.coveragerc" --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope tests +gpuci_logger "Python py.tests for cuDF with spilling (CUDF_SPILL_DEVICE_LIMIT=1)" +# Due to time concerns, we only run tests marked "spilling" +CUDF_SPILL=on CUDF_SPILL_DEVICE_LIMIT=1 py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" -v --cov-config="$WORKSPACE/python/cudf/.coveragerc" --cov-append --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-coverage.xml" --cov-report term --dist=loadscope -m spilling tests + cd "$WORKSPACE/python/dask_cudf" gpuci_logger "Python py.test for dask-cudf" py.test -n 8 --cache-clear --basetemp="$WORKSPACE/dask-cudf-cuda-tmp" --junitxml="$WORKSPACE/junit-dask-cudf.xml" -v --cov-config=.coveragerc --cov=dask_cudf --cov-report=xml:"$WORKSPACE/python/dask_cudf/dask-cudf-coverage.xml" --cov-report term dask_cudf @@ -280,22 +299,15 @@ py.test -n 8 --cache-clear --basetemp="$WORKSPACE/custreamz-cuda-tmp" --junitxml gpuci_logger "Installing strings_udf" gpuci_mamba_retry install strings_udf -c "${CONDA_BLD_DIR}" -c "${CONDA_ARTIFACT_PATH}" -# only install strings_udf after cuDF is finished testing without its presence cd "$WORKSPACE/python/strings_udf/strings_udf" gpuci_logger "Python py.test for strings_udf" +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/strings-udf-cuda-tmp" --junitxml="$WORKSPACE/junit-strings-udf.xml" -v --cov-config=.coveragerc --cov=strings_udf --cov-report=xml:"$WORKSPACE/python/strings_udf/strings-udf-coverage.xml" --cov-report term tests -STRINGS_UDF_PYTEST_RETCODE=0 -py.test -n 8 --cache-clear --basetemp="$WORKSPACE/strings-udf-cuda-tmp" --junitxml="$WORKSPACE/junit-strings-udf.xml" -v --cov-config=.coveragerc --cov=strings_udf --cov-report=xml:"$WORKSPACE/python/strings_udf/strings-udf-coverage.xml" --cov-report term 
tests || STRINGS_UDF_PYTEST_RETCODE=$? +# retest cuDF UDFs +cd "$WORKSPACE/python/cudf/cudf" +gpuci_logger "Python py.test retest cuDF UDFs" +py.test -n 8 --cache-clear --basetemp="$WORKSPACE/cudf-cuda-strings-udf-tmp" --ignore="$WORKSPACE/python/cudf/cudf/benchmarks" --junitxml="$WORKSPACE/junit-cudf-strings-udf.xml" -v --cov-config="$WORKSPACE/python/cudf/.coveragerc" --cov=cudf --cov-report=xml:"$WORKSPACE/python/cudf/cudf-strings-udf-coverage.xml" --cov-report term --dist=loadscope tests/test_udf_masked_ops.py -if [ ${STRINGS_UDF_PYTEST_RETCODE} -eq 5 ]; then - echo "No strings UDF tests were run, but this script will continue to execute." -elif [ ${STRINGS_UDF_PYTEST_RETCODE} -ne 0 ]; then - exit ${STRINGS_UDF_PYTEST_RETCODE} -else - cd "$WORKSPACE/python/cudf/cudf" - gpuci_logger "Python py.test retest cuDF UDFs" - py.test tests/test_udf_masked_ops.py -n 8 --cache-clear -fi # Run benchmarks with both cudf and pandas to ensure compatibility is maintained. # Benchmarks are run in DEBUG_ONLY mode, meaning that only small data sizes are used. diff --git a/ci/gpu/java.sh b/ci/gpu/java.sh index b110303662b..e1d3bab2bc5 100755 --- a/ci/gpu/java.sh +++ b/ci/gpu/java.sh @@ -31,7 +31,7 @@ export GIT_DESCRIBE_TAG=`git describe --tags` export MINOR_VERSION=`echo $GIT_DESCRIBE_TAG | grep -o -E '([0-9]+\.[0-9]+)'` # ucx-py version -export UCX_PY_VERSION='0.28.*' +export UCX_PY_VERSION='0.29.*' ################################################################################ # TRAP - Setup trap for removing jitify cache diff --git a/ci/release/update-version.sh b/ci/release/update-version.sh index 8fad4e08c56..9dcfe093643 100755 --- a/ci/release/update-version.sh +++ b/ci/release/update-version.sh @@ -34,6 +34,9 @@ function sed_runner() { # cpp update sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/CMakeLists.txt +# cpp stream testing update +sed_runner 's/'"VERSION ${CURRENT_SHORT_TAG}.*"'/'"VERSION ${NEXT_FULL_TAG}"'/g' cpp/tests/utilities/identify_stream_usage/CMakeLists.txt + # Python update sed_runner 's/'"cudf_version .*)"'/'"cudf_version ${NEXT_FULL_TAG})"'/g' python/cudf/CMakeLists.txt @@ -60,9 +63,10 @@ sed_runner 's/version = .*/version = '"'${NEXT_SHORT_TAG}'"'/g' docs/cudf/source sed_runner 's/release = .*/release = '"'${NEXT_FULL_TAG}'"'/g' docs/cudf/source/conf.py # bump rmm & dask-cuda -for FILE in conda/environments/*.yml; do - sed_runner "s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE}; +for FILE in conda/environments/*.yaml dependencies.yaml; do sed_runner "s/dask-cuda=${CURRENT_SHORT_TAG}/dask-cuda=${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner "s/rmm=${CURRENT_SHORT_TAG}/rmm=${NEXT_SHORT_TAG}/g" ${FILE}; + sed_runner "s/rmm-cu11=${CURRENT_SHORT_TAG}/rmm-cu11=${NEXT_SHORT_TAG}/g" ${FILE}; done # Doxyfile update @@ -74,6 +78,7 @@ sed_runner "s/cudf=${CURRENT_SHORT_TAG}/cudf=${NEXT_SHORT_TAG}/g" README.md # Libcudf examples update sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/basic/CMakeLists.txt +sed_runner "s/CUDF_TAG branch-${CURRENT_SHORT_TAG}/CUDF_TAG branch-${NEXT_SHORT_TAG}/" cpp/examples/strings/CMakeLists.txt # ucx-py version update sed_runner "s/export UCX_PY_VERSION=.*/export UCX_PY_VERSION='${NEXT_UCX_PY_VERSION}'/g" ci/gpu/build.sh diff --git a/codecov.yml b/codecov.yml index f9d0f906807..344d4f3f04e 100644 --- a/codecov.yml +++ b/codecov.yml @@ -2,10 +2,10 @@ coverage: status: project: off - patch: on + patch: default: target: auto - threshold: 0% + threshold: 5% 
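      # Note on semantics (a summary of codecov's documented behavior, added for
      # reviewers): `target: auto` compares the PR's diff coverage against the base
      # commit, and `threshold: 5%` lets that coverage drop by up to five
      # percentage points before the patch status is reported as failed.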
github_checks: annotations: true diff --git a/conda/environments/all_cuda-115_arch-x86_64.yaml b/conda/environments/all_cuda-115_arch-x86_64.yaml new file mode 100644 index 00000000000..cd900efced5 --- /dev/null +++ b/conda/environments/all_cuda-115_arch-x86_64.yaml @@ -0,0 +1,78 @@ +# This file is generated by `rapids-dependency-file-generator`. +# To make changes, edit ../../dependencies.yaml and run `rapids-dependency-file-generator`. +channels: +- rapidsai +- rapidsai-nightly +- dask/label/dev +- conda-forge +- nvidia +dependencies: +- aiobotocore>=2.2.0 +- arrow-cpp=9 +- boto3>=1.21.21 +- botocore>=1.24.21 +- c-compiler +- cachetools +- cmake>=3.23.1,!=3.25.0 +- cubinlinker +- cuda-python>=11.7.1,<12.0 +- cudatoolkit=11.5 +- cupy>=9.5.0,<12.0.0a0 +- cxx-compiler +- cython>=0.29,<0.30 +- dask-cuda=22.12.* +- dask==2022.11.1 +- distributed==2022.11.1 +- dlpack>=0.5,<0.6.0a0 +- doxygen=1.8.20 +- fastavro>=0.22.9 +- fsspec>=0.6.0 +- gcc_linux-64=9.* +- hypothesis +- ipython +- librdkafka=1.7.0 +- mimesis>=4.1.0 +- moto>=4.0.8 +- myst-nb +- nbsphinx +- notebook>=0.5.0 +- numba>=0.56.2 +- numpy +- numpydoc +- nvcc_linux-64=11.5 +- nvtx>=0.2.1 +- packaging +- pandas>=1.0,<1.6.0dev0 +- pandoc<=2.0.0 +- pip +- pre-commit +- protobuf>=3.20.1,<3.21.0a0 +- ptxcompiler +- pyarrow=9.0.0 +- pydata-sphinx-theme +- pytest +- pytest-benchmark +- pytest-cases +- pytest-cov +- pytest-xdist +- python-confluent-kafka=1.7.0 +- python-snappy>=0.6.0 +- python>=3.8,<3.10 +- pytorch<1.12.0 +- rmm=22.12.* +- s3fs>=2022.3.0 +- scikit-build>=0.13.1 +- scipy +- sphinx +- sphinx-autobuild +- sphinx-copybutton +- sphinx-markdown-tables +- sphinxcontrib-websupport +- streamz +- sysroot_linux-64==2.17 +- transformers +- typing_extensions +- pip: + - git+https://github.com/python-streamz/streamz.git@master + - pyorc +name: all_cuda-115_arch-x86_64 diff --git a/conda/environments/cudf_dev_cuda11.5.yml b/conda/environments/cudf_dev_cuda11.5.yml deleted file mode 100644 index 37df0ba48dc..00000000000 --- a/conda/environments/cudf_dev_cuda11.5.yml +++ /dev/null @@ -1,92 +0,0 @@ -# Copyright (c) 2021-2022, NVIDIA CORPORATION. 
- -name: cudf_dev -channels: - - rapidsai - - rapidsai-nightly - - dask/label/dev - - conda-forge - - nvidia -dependencies: - - c-compiler - - cxx-compiler - - clang=11.1.0 - - clang-tools=11.1.0 - - cupy>=9.5.0,<12.0.0a0 - - rmm=22.10.* - - cmake>=3.23.1 - - cmake_setuptools>=0.1.3 - - scikit-build>=0.13.1 - - python>=3.8,<3.10 - - numba>=0.56.2 - - numpy - - pandas>=1.0,<1.6.0dev0 - - pyarrow=9 - - fastavro>=0.22.9 - - python-snappy>=0.6.0 - - notebook>=0.5.0 - - cython>=0.29,<0.30 - - fsspec>=0.6.0 - - pytest - - pytest-benchmark - - pytest-cases - - pytest-xdist - - sphinx - - sphinxcontrib-websupport - - nbsphinx - - numpydoc - - ipython - - pandoc<=2.0.0 - - cudatoolkit=11.5 - - cuda-python>=11.7.1,<12.0 - - pip - - flake8=3.8.3 - - black=22.3.0 - - isort=5.10.1 - - mypy=0.971 - - types-cachetools - - doxygen=1.8.20 - - pydocstyle=6.1.1 - - typing_extensions - - pre-commit - - dask==2022.9.2 - - distributed==2022.9.2 - - streamz - - arrow-cpp=9 - - dlpack>=0.5,<0.6.0a0 - - double-conversion - - rapidjson - - hypothesis - - sphinx-markdown-tables - - sphinx-copybutton - - sphinx-autobuild - - myst-nb - - scipy - - dask-cuda=22.10.* - - mimesis<4.1 - - packaging - - protobuf>=3.20.1,<3.21.0a0 - - nvtx>=0.2.1 - - cachetools - - transformers<=4.10.3 - - pydata-sphinx-theme - - librdkafka=1.7.0 - - python-confluent-kafka=1.7.0 - - moto>=3.1.6 - - boto3>=1.21.21 - - botocore>=1.24.21 - - aiobotocore>=2.2.0 - - s3fs>=2022.3.0 - - werkzeug<2.2.0 # Temporary transient dependency pinning to avoid URL-LIB3 + moto timeouts - - pytorch<1.12.0 - - pip: - - git+https://github.com/python-streamz/streamz.git@master - - pyorc - - cubinlinker # [linux64] - - gcc_linux-64=9.* # [linux64] - - sysroot_linux-64==2.17 # [linux64] - - nvcc_linux-64=11.5 - # Un-comment following lines for ARM specific packages. 
- # - gcc_linux-aarch64=9.* # [aarch64] - # - sysroot_linux-aarch64==2.17 # [aarch64] - # - nvcc_linux-aarch64=11.5 # [aarch64] diff --git a/conda/recipes/cudf/conda_build_config.yaml b/conda/recipes/cudf/conda_build_config.yaml index 0027a80f1ec..4feac647e8c 100644 --- a/conda/recipes/cudf/conda_build_config.yaml +++ b/conda/recipes/cudf/conda_build_config.yaml @@ -8,7 +8,7 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.23.1" + - ">=3.23.1,!=3.25.0" cuda_compiler: - nvcc diff --git a/conda/recipes/cudf_kafka/meta.yaml b/conda/recipes/cudf_kafka/meta.yaml index a65373efec3..4f7a4bbc268 100644 --- a/conda/recipes/cudf_kafka/meta.yaml +++ b/conda/recipes/cudf_kafka/meta.yaml @@ -22,7 +22,7 @@ build: requirements: build: - - cmake >=3.23.1 + - cmake >=3.23.1,!=3.25.0 - {{ compiler('c') }} - {{ compiler('cxx') }} - sysroot_{{ target_platform }} {{ sysroot_version }} diff --git a/conda/recipes/custreamz/meta.yaml b/conda/recipes/custreamz/meta.yaml index 596e5fde940..b5a27cdac92 100644 --- a/conda/recipes/custreamz/meta.yaml +++ b/conda/recipes/custreamz/meta.yaml @@ -29,8 +29,8 @@ requirements: - python - streamz - cudf ={{ version }} - - dask==2022.9.2 - - distributed==2022.9.2 + - dask==2022.11.1 + - distributed==2022.11.1 - python-confluent-kafka >=1.7.0,<1.8.0a0 - cudf_kafka ={{ version }} diff --git a/conda/recipes/dask-cudf/meta.yaml b/conda/recipes/dask-cudf/meta.yaml index 2d95151018b..d97a8448a53 100644 --- a/conda/recipes/dask-cudf/meta.yaml +++ b/conda/recipes/dask-cudf/meta.yaml @@ -24,14 +24,14 @@ requirements: host: - python - cudf ={{ version }} - - dask==2022.9.2 - - distributed==2022.9.2 + - dask==2022.11.1 + - distributed==2022.11.1 - cudatoolkit ={{ cuda_version }} run: - python - cudf ={{ version }} - - dask==2022.9.2 - - distributed==2022.9.2 + - dask==2022.11.1 + - distributed==2022.11.1 - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} test: # [linux64] diff --git a/conda/recipes/libcudf/conda_build_config.yaml b/conda/recipes/libcudf/conda_build_config.yaml index 7f5bf219f1f..5179cb55d84 100644 --- a/conda/recipes/libcudf/conda_build_config.yaml +++ b/conda/recipes/libcudf/conda_build_config.yaml @@ -11,7 +11,7 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.23.1" + - ">=3.23.1,!=3.25.0" gtest_version: - "=1.10.0" diff --git a/conda/recipes/libcudf/meta.yaml b/conda/recipes/libcudf/meta.yaml index a417b407044..ceafc44ed10 100644 --- a/conda/recipes/libcudf/meta.yaml +++ b/conda/recipes/libcudf/meta.yaml @@ -78,7 +78,6 @@ outputs: - test -f $PREFIX/include/cudf/detail/binaryop.hpp - test -f $PREFIX/include/cudf/detail/calendrical_month_sequence.cuh - test -f $PREFIX/include/cudf/detail/concatenate.hpp - - test -f $PREFIX/include/cudf/detail/copy.cuh - test -f $PREFIX/include/cudf/detail/copy.hpp - test -f $PREFIX/include/cudf/detail/datetime.hpp - test -f $PREFIX/include/cudf/detail/fill.hpp @@ -113,9 +112,11 @@ outputs: - test -f $PREFIX/include/cudf/detail/transpose.hpp - test -f $PREFIX/include/cudf/detail/unary.hpp - test -f $PREFIX/include/cudf/detail/utilities/alignment.hpp - - test -f $PREFIX/include/cudf/detail/utilities/linked_column.hpp + - test -f $PREFIX/include/cudf/detail/utilities/default_stream.hpp - test -f $PREFIX/include/cudf/detail/utilities/int_fastdiv.h - test -f $PREFIX/include/cudf/detail/utilities/integer_utils.hpp + - test -f $PREFIX/include/cudf/detail/utilities/linked_column.hpp + - test -f $PREFIX/include/cudf/detail/utilities/pinned_allocator.hpp - test -f $PREFIX/include/cudf/detail/utilities/vector_factories.hpp - 
test -f $PREFIX/include/cudf/detail/utilities/visitor_overload.hpp - test -f $PREFIX/include/cudf/dictionary/detail/concatenate.hpp @@ -148,10 +149,12 @@ outputs: - test -f $PREFIX/include/cudf/io/json.hpp - test -f $PREFIX/include/cudf/io/orc.hpp - test -f $PREFIX/include/cudf/io/orc_metadata.hpp + - test -f $PREFIX/include/cudf/io/orc_types.hpp - test -f $PREFIX/include/cudf/io/parquet.hpp - test -f $PREFIX/include/cudf/io/text/byte_range_info.hpp - test -f $PREFIX/include/cudf/io/text/data_chunk_source.hpp - test -f $PREFIX/include/cudf/io/text/data_chunk_source_factories.hpp + - test -f $PREFIX/include/cudf/io/text/detail/bgzip_utils.hpp - test -f $PREFIX/include/cudf/io/text/detail/multistate.hpp - test -f $PREFIX/include/cudf/io/text/detail/tile_state.hpp - test -f $PREFIX/include/cudf/io/text/detail/trie.hpp @@ -232,6 +235,7 @@ outputs: - test -f $PREFIX/include/cudf/strings/json.hpp - test -f $PREFIX/include/cudf/strings/padding.hpp - test -f $PREFIX/include/cudf/strings/regex/flags.hpp + - test -f $PREFIX/include/cudf/strings/regex/regex_program.hpp - test -f $PREFIX/include/cudf/strings/repeat_strings.hpp - test -f $PREFIX/include/cudf/strings/replace.hpp - test -f $PREFIX/include/cudf/strings/replace_re.hpp @@ -250,7 +254,7 @@ outputs: - test -f $PREFIX/include/cudf/structs/structs_column_view.hpp - test -f $PREFIX/include/cudf/table/table.hpp - test -f $PREFIX/include/cudf/table/table_view.hpp - - test -f $PREFIX/include/cudf/tdigest/tdigest_column_view.cuh + - test -f $PREFIX/include/cudf/tdigest/tdigest_column_view.hpp - test -f $PREFIX/include/cudf/transform.hpp - test -f $PREFIX/include/cudf/transpose.hpp - test -f $PREFIX/include/cudf/types.hpp @@ -274,6 +278,7 @@ outputs: - test -f $PREFIX/include/cudf_test/file_utilities.hpp - test -f $PREFIX/include/cudf_test/io_metadata_utilities.hpp - test -f $PREFIX/include/cudf_test/iterator_utilities.hpp + - test -f $PREFIX/include/cudf_test/stream_checking_resource_adapter.hpp - test -f $PREFIX/include/cudf_test/table_utilities.hpp - test -f $PREFIX/include/cudf_test/timestamp_utilities.cuh - test -f $PREFIX/include/cudf_test/type_list_utilities.hpp diff --git a/conda/recipes/strings_udf/conda_build_config.yaml b/conda/recipes/strings_udf/conda_build_config.yaml index 0027a80f1ec..4feac647e8c 100644 --- a/conda/recipes/strings_udf/conda_build_config.yaml +++ b/conda/recipes/strings_udf/conda_build_config.yaml @@ -8,7 +8,7 @@ sysroot_version: - "2.17" cmake_version: - - ">=3.23.1" + - ">=3.23.1,!=3.25.0" cuda_compiler: - nvcc diff --git a/conda/recipes/strings_udf/meta.yaml b/conda/recipes/strings_udf/meta.yaml index a736edef24d..027a8a82aae 100644 --- a/conda/recipes/strings_udf/meta.yaml +++ b/conda/recipes/strings_udf/meta.yaml @@ -50,7 +50,7 @@ requirements: - cudf ={{ version }} - {{ pin_compatible('cudatoolkit', max_pin='x', min_pin='x') }} - cachetools - - ptxcompiler # [linux64] # CUDA enhanced compatibility. See https://github.com/rapidsai/ptxcompiler + - ptxcompiler >=0.7.0 # [linux64] # CUDA enhanced compatibility. 
See https://github.com/rapidsai/ptxcompiler test: # [linux64] requires: # [linux64] - cudatoolkit {{ cuda_version }}.* # [linux64] diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 2bbb9996e58..6f4f42f6842 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -25,7 +25,7 @@ rapids_cuda_init_architectures(CUDF) project( CUDF - VERSION 22.10.01 + VERSION 22.12.00 LANGUAGES C CXX CUDA ) if(CMAKE_CUDA_COMPILER_ID STREQUAL "NVIDIA" AND CMAKE_CUDA_COMPILER_VERSION VERSION_LESS 11.5) @@ -48,6 +48,8 @@ option(BUILD_TESTS "Configure CMake to build tests" ON) option(BUILD_BENCHMARKS "Configure CMake to build (google & nvbench) benchmarks" OFF) option(BUILD_SHARED_LIBS "Build cuDF shared libraries" ON) option(JITIFY_USE_CACHE "Use a file cache for JIT compiled kernels" ON) +option(CUDF_BUILD_TESTUTIL "Whether to build the test utilities contained in libcudf" ON) +mark_as_advanced(CUDF_BUILD_TESTUTIL) option(CUDF_USE_PROPRIETARY_NVCOMP "Download and use NVCOMP with proprietary extensions" ON) option(CUDF_USE_ARROW_STATIC "Build and statically link Arrow libraries" OFF) option(CUDF_ENABLE_ARROW_ORC "Build the Arrow ORC adapter" OFF) @@ -60,14 +62,17 @@ option( stream to external libraries." OFF ) -option(DISABLE_DEPRECATION_WARNING "Disable warnings generated from deprecated declarations." OFF) +option(DISABLE_DEPRECATION_WARNINGS "Disable warnings generated from deprecated declarations." OFF) # Option to enable line info in CUDA device compilation to allow introspection when profiling / # memchecking option(CUDA_ENABLE_LINEINFO "Enable the -lineinfo option for nvcc (useful for cuda-memcheck / profiler" OFF ) +option(CUDA_WARNINGS_AS_ERRORS "Enable -Werror=all-warnings for all CUDA compilation" ON) # cudart can be statically linked or dynamically linked. The python ecosystem wants dynamic linking option(CUDA_STATIC_RUNTIME "Statically link the CUDA runtime" OFF) +option(USE_LIBARROW_FROM_PYARROW "Use the libarrow contained within pyarrow." OFF) +mark_as_advanced(USE_LIBARROW_FROM_PYARROW) message(VERBOSE "CUDF: Build with NVTX support: ${USE_NVTX}") message(VERBOSE "CUDF: Configure CMake to build tests: ${BUILD_TESTS}") @@ -79,7 +84,7 @@ message(VERBOSE "CUDF: Build and enable S3 filesystem support for Arrow: ${CUDF_ message(VERBOSE "CUDF: Build with per-thread default stream: ${CUDF_USE_PER_THREAD_DEFAULT_STREAM}") message( VERBOSE - "CUDF: Disable warnings generated from deprecated declarations: ${DISABLE_DEPRECATION_WARNING}" + "CUDF: Disable warnings generated from deprecated declarations: ${DISABLE_DEPRECATION_WARNINGS}" ) message( VERBOSE @@ -91,6 +96,12 @@ message(VERBOSE "CUDF: Statically link the CUDA runtime: ${CUDA_STATIC_RUNTIME}" rapids_cmake_build_type("Release") set(CUDF_BUILD_TESTS ${BUILD_TESTS}) set(CUDF_BUILD_BENCHMARKS ${BUILD_BENCHMARKS}) +if(BUILD_TESTS AND NOT CUDF_BUILD_TESTUTIL) + message( + FATAL_ERROR + "Tests cannot be built without building cudf test utils. 
Please set CUDF_BUILD_TESTUTIL=ON or BUILD_TESTS=OFF" + ) +endif() set(CUDF_CXX_FLAGS "") set(CUDF_CUDA_FLAGS "") @@ -124,24 +135,20 @@ rapids_find_package( ) include(cmake/Modules/ConfigureCUDA.cmake) # set other CUDA compilation flags -# ctest cuda memcheck -find_program(CUDA_SANITIZER compute-sanitizer) -set(MEMORYCHECK_COMMAND ${CUDA_SANITIZER}) -set(MEMORYCHECK_TYPE CudaSanitizer) -set(CUDA_SANITIZER_COMMAND_OPTIONS "--tool memcheck") - # ################################################################################################## # * dependencies ---------------------------------------------------------------------------------- # find zlib rapids_find_package(ZLIB REQUIRED) -# find Threads (needed by cudftestutil) -rapids_find_package( - Threads REQUIRED - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports -) +if(CUDF_BUILD_TESTUTIL) + # find Threads (needed by cudftestutil) + rapids_find_package( + Threads REQUIRED + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports + ) +endif() # add third party dependencies using CPM rapids_cpm_init() @@ -163,7 +170,9 @@ rapids_cpm_libcudacxx(BUILD_EXPORT_SET cudf-exports INSTALL_EXPORT_SET cudf-expo # find cuCollections Should come after including thrust and libcudacxx include(cmake/thirdparty/get_cucollections.cmake) # find or install GoogleTest -include(cmake/thirdparty/get_gtest.cmake) +if(CUDF_BUILD_TESTUTIL) + include(cmake/thirdparty/get_gtest.cmake) +endif() # preprocess jitify-able kernels include(cmake/Modules/JitifyPreprocessKernels.cmake) # find cuFile @@ -332,6 +341,7 @@ add_library( src/io/json/json_tree.cu src/io/json/nested_json_gpu.cu src/io/json/reader_impl.cu + src/io/json/experimental/byte_range_info.cu src/io/json/experimental/read_json.cpp src/io/orc/aggregate_orc_metadata.cpp src/io/orc/dict_enc.cu @@ -349,13 +359,17 @@ add_library( src/io/parquet/chunk_dict.cu src/io/parquet/page_enc.cu src/io/parquet/page_hdr.cu - src/io/parquet/reader_impl.cu + src/io/parquet/reader.cpp + src/io/parquet/reader_impl.cpp + src/io/parquet/reader_impl_helpers.cpp + src/io/parquet/reader_impl_preprocess.cu src/io/parquet/writer_impl.cu src/io/statistics/orc_column_statistics.cu src/io/statistics/parquet_column_statistics.cu src/io/text/byte_range_info.cpp src/io/text/data_chunk_source_factories.cpp src/io/text/bgzip_data_chunk_source.cu + src/io/text/bgzip_utils.cpp src/io/text/multibyte_split.cu src/io/utilities/column_buffer.cpp src/io/utilities/config_utils.cpp @@ -503,7 +517,8 @@ add_library( src/strings/padding.cu src/strings/json/json_path.cu src/strings/regex/regcomp.cpp - src/strings/regex/regexec.cu + src/strings/regex/regexec.cpp + src/strings/regex/regex_program.cpp src/strings/repeat_strings.cu src/strings/replace/backref_re.cu src/strings/replace/multi_re.cu @@ -616,9 +631,15 @@ target_compile_definitions( # Disable Jitify log printing. See https://github.com/NVIDIA/jitify/issues/79 target_compile_definitions(cudf PRIVATE "JITIFY_PRINT_LOG=0") -# Instruct jitify to use the kernel JIT cache if(JITIFY_USE_CACHE) - target_compile_definitions(cudf PUBLIC JITIFY_USE_CACHE "CUDF_VERSION=${PROJECT_VERSION}") + # Instruct src/jit/cache what version of cudf we are building so it can compute a cal-ver cache + # directory. 
We isolate this definition to the single source so it doesn't affect compiling + caching for all of libcudf + set_property( + SOURCE src/jit/cache.cpp + APPEND + PROPERTY COMPILE_DEFINITIONS "JITIFY_USE_CACHE" "CUDF_VERSION=${PROJECT_VERSION}" + ) endif() # Per-thread default stream @@ -685,53 +706,61 @@ add_library(cudf::cudf ALIAS cudf) # ################################################################################################## # * build cudftestutil ---------------------------------------------------------------------------- -add_library( - cudftestutil STATIC - tests/io/metadata_utilities.cpp - tests/quantiles/tdigest_utilities.cu - tests/utilities/base_fixture.cpp - tests/utilities/column_utilities.cu - tests/utilities/table_utilities.cu - tests/strings/utilities.cpp -) +if(CUDF_BUILD_TESTUTIL) + add_library( + cudftestutil STATIC + tests/io/metadata_utilities.cpp + tests/utilities/base_fixture.cpp + tests/utilities/column_utilities.cu + tests/utilities/table_utilities.cu + tests/utilities/tdigest_utilities.cu + ) -set_target_properties( - cudftestutil - PROPERTIES BUILD_RPATH "\$ORIGIN" - INSTALL_RPATH "\$ORIGIN" - # set target compile options - CXX_STANDARD 17 - CXX_STANDARD_REQUIRED ON - CUDA_STANDARD 17 - CUDA_STANDARD_REQUIRED ON - POSITION_INDEPENDENT_CODE ON - INTERFACE_POSITION_INDEPENDENT_CODE ON -) + set_target_properties( + cudftestutil + PROPERTIES BUILD_RPATH "\$ORIGIN" + INSTALL_RPATH "\$ORIGIN" + # set target compile options + CXX_STANDARD 17 + CXX_STANDARD_REQUIRED ON + CUDA_STANDARD 17 + CUDA_STANDARD_REQUIRED ON + POSITION_INDEPENDENT_CODE ON + INTERFACE_POSITION_INDEPENDENT_CODE ON + ) -target_compile_options( - cudftestutil PUBLIC "$:${CUDF_CXX_FLAGS}>>" - "$:${CUDF_CUDA_FLAGS}>>" -) + target_compile_options( + cudftestutil PUBLIC "$:${CUDF_CXX_FLAGS}>>" + "$:${CUDF_CUDA_FLAGS}>>" + ) -target_link_libraries( - cudftestutil - PUBLIC GTest::gmock GTest::gtest Threads::Threads cudf - PRIVATE $ -) + target_link_libraries( + cudftestutil + PUBLIC GTest::gmock GTest::gtest Threads::Threads cudf + PRIVATE $ + ) -target_include_directories( - cudftestutil PUBLIC "$" - "$" -) + target_include_directories( + cudftestutil PUBLIC "$" + "$" + ) -add_library(cudf::cudftestutil ALIAS cudftestutil) + add_library(cudf::cudftestutil ALIAS cudftestutil) +endif() # ################################################################################################## # * add tests ------------------------------------------------------------------------------------- if(CUDF_BUILD_TESTS) # include CTest module -- automatically calls enable_testing() include(CTest) + + # ctest cuda memcheck + find_program(CUDA_SANITIZER compute-sanitizer) + set(MEMORYCHECK_COMMAND ${CUDA_SANITIZER}) + set(MEMORYCHECK_TYPE CudaSanitizer) + set(CUDA_SANITIZER_COMMAND_OPTIONS "--tool memcheck") + # Always print verbose output when tests fail if run using `make test`.
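  # (For context, this relies on documented CMake >= 3.17 behavior:
  # CMAKE_CTEST_ARGUMENTS is forwarded to the ctest invocation behind the
  # `test`/`RUN_TESTS` targets, so the flag appended below applies to
  # `make test` and `ninja test` alike.)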
list(APPEND CMAKE_CTEST_ARGUMENTS "--output-on-failure") add_subdirectory(tests) @@ -742,13 +771,8 @@ endif() if(CUDF_BUILD_BENCHMARKS) # Find or install GoogleBench - rapids_cpm_find( - benchmark 1.5.2 - GIT_REPOSITORY https://github.com/google/benchmark.git - GIT_TAG v1.5.2 - GIT_SHALLOW TRUE - OPTIONS "BENCHMARK_ENABLE_TESTING OFF" "BENCHMARK_ENABLE_INSTALL OFF" - ) + include(${rapids-cmake-dir}/cpm/gbench.cmake) + rapids_cpm_gbench() # Find or install NVBench Temporarily force downloading of fmt because current versions of nvbench # do not support the latest version of fmt, which is automatically pulled into our conda @@ -759,11 +783,6 @@ if(CUDF_BUILD_BENCHMARKS) add_subdirectory(benchmarks) endif() -# build pretty-printer load script -if(Thrust_SOURCE_DIR AND rmm_SOURCE_DIR) - configure_file(scripts/load-pretty-printers.in load-pretty-printers @ONLY) -endif() - # ################################################################################################## # * install targets ------------------------------------------------------------------------------- rapids_cmake_install_lib_dir(lib_dir) @@ -783,24 +802,26 @@ install(DIRECTORY ${CUDF_SOURCE_DIR}/include/cudf ${CUDF_SOURCE_DIR}/include/cud ${CUDF_SOURCE_DIR}/include/nvtext DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} ) -install( - TARGETS cudftestutil - DESTINATION ${lib_dir} - EXPORT cudf-testing-exports -) +if(CUDF_BUILD_TESTUTIL) + install( + TARGETS cudftestutil + DESTINATION ${lib_dir} + EXPORT cudf-testing-exports + ) -install( - EXPORT cudf-testing-exports - FILE cudf-testing-targets.cmake - NAMESPACE cudf:: - DESTINATION "${lib_dir}/cmake/cudf" -) + install( + EXPORT cudf-testing-exports + FILE cudf-testing-targets.cmake + NAMESPACE cudf:: + DESTINATION "${lib_dir}/cmake/cudf" + ) -include("${rapids-cmake-dir}/export/write_dependencies.cmake") -rapids_export_write_dependencies( - INSTALL cudf-testing-exports - "${PROJECT_BINARY_DIR}/rapids-cmake/cudf/export/cudf-testing-dependencies.cmake" -) + include("${rapids-cmake-dir}/export/write_dependencies.cmake") + rapids_export_write_dependencies( + INSTALL cudf-testing-exports + "${PROJECT_BINARY_DIR}/rapids-cmake/cudf/export/cudf-testing-dependencies.cmake" + ) +endif() set(doc_string [=[ @@ -890,6 +911,7 @@ if(EXISTS "${CMAKE_CURRENT_LIST_DIR}/cudf-testing-targets.cmake") endif() ]=] ) + string(APPEND build_code_string "${common_code_string}") rapids_export( @@ -901,15 +923,16 @@ rapids_export( FINAL_CODE_BLOCK build_code_string ) -export( - EXPORT cudf-testing-exports - FILE ${CUDF_BINARY_DIR}/cudf-testing-targets.cmake - NAMESPACE cudf:: -) -rapids_export_write_dependencies( - BUILD cudf-testing-exports "${CUDF_BINARY_DIR}/cudf-testing-dependencies.cmake" -) - +if(CUDF_BUILD_TESTUTIL) + export( + EXPORT cudf-testing-exports + FILE ${CUDF_BINARY_DIR}/cudf-testing-targets.cmake + NAMESPACE cudf:: + ) + rapids_export_write_dependencies( + BUILD cudf-testing-exports "${CUDF_BINARY_DIR}/cudf-testing-dependencies.cmake" + ) +endif() # ################################################################################################## # * make documentation ---------------------------------------------------------------------------- @@ -927,3 +950,11 @@ add_custom_target( DEPENDS CUDF_DOXYGEN COMMENT "Custom command for building cudf doxygen docs." 
) + +# ################################################################################################## +# * make gdb helper scripts ------------------------------------------------------------------------ + +# build pretty-printer load script +if(Thrust_SOURCE_DIR AND rmm_SOURCE_DIR) + configure_file(scripts/load-pretty-printers.in load-pretty-printers @ONLY) +endif() diff --git a/cpp/benchmarks/CMakeLists.txt b/cpp/benchmarks/CMakeLists.txt index d1ff177a25e..48c9ba5f185 100644 --- a/cpp/benchmarks/CMakeLists.txt +++ b/cpp/benchmarks/CMakeLists.txt @@ -169,7 +169,7 @@ ConfigureNVBench(SEARCH_NVBENCH search/contains.cpp) # ################################################################################################## # * sort benchmark -------------------------------------------------------------------------------- ConfigureBench(SORT_BENCH sort/rank.cpp sort/sort.cpp sort/sort_strings.cpp) -ConfigureNVBench(SORT_NVBENCH sort/sort_lists.cpp sort/sort_structs.cpp) +ConfigureNVBench(SORT_NVBENCH sort/segmented_sort.cpp sort/sort_lists.cpp sort/sort_structs.cpp) # ################################################################################################## # * quantiles benchmark @@ -301,7 +301,8 @@ ConfigureNVBench(NESTED_JSON_NVBENCH io/json/nested_json.cpp) # ################################################################################################## # * io benchmark --------------------------------------------------------------------- -ConfigureNVBench(MULTIBYTE_SPLIT_BENCHMARK io/text/multibyte_split.cpp) +ConfigureNVBench(MULTIBYTE_SPLIT_NVBENCH io/text/multibyte_split.cpp) +target_link_libraries(MULTIBYTE_SPLIT_NVBENCH PRIVATE ZLIB::ZLIB) add_custom_target( run_benchmarks diff --git a/cpp/benchmarks/column/concatenate.cpp b/cpp/benchmarks/column/concatenate.cpp index 99aa414fae3..3260159b409 100644 --- a/cpp/benchmarks/column/concatenate.cpp +++ b/cpp/benchmarks/column/concatenate.cpp @@ -49,7 +49,7 @@ static void BM_concatenate(benchmark::State& state) CUDF_CHECK_CUDA(0); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = cudf::concatenate(column_views); } @@ -91,7 +91,7 @@ static void BM_concatenate_tables(benchmark::State& state) CUDF_CHECK_CUDA(0); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = cudf::concatenate(table_views); } @@ -150,7 +150,7 @@ static void BM_concatenate_strings(benchmark::State& state) CUDF_CHECK_CUDA(0); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = cudf::concatenate(column_views); } diff --git a/cpp/benchmarks/common/generate_input.cu b/cpp/benchmarks/common/generate_input.cu index 890a78bb9bf..dee7e2b8586 100644 --- a/cpp/benchmarks/common/generate_input.cu +++ b/cpp/benchmarks/common/generate_input.cu @@ -207,7 +207,7 @@ struct random_value_fn()>> { } else { // Don't need a random seconds generator for sub-second intervals seconds_gen = [range_s](thrust::minstd_rand&, size_t size) { - rmm::device_uvector result(size, cudf::default_stream_value); + rmm::device_uvector result(size, cudf::get_default_stream()); thrust::fill(thrust::device, result.begin(), result.end(), range_s.second.count()); return result; }; @@ -225,7 +225,7 @@ struct random_value_fn()>> { { auto const sec = 
seconds_gen(engine, size); auto const ns = nanoseconds_gen(engine, size); - rmm::device_uvector result(size, cudf::default_stream_value); + rmm::device_uvector result(size, cudf::get_default_stream()); thrust::transform( thrust::device, sec.begin(), @@ -247,40 +247,33 @@ struct random_value_fn()>> { */ template struct random_value_fn()>> { - using rep = typename T::rep; - rep const lower_bound; - rep const upper_bound; - distribution_fn dist; + using DeviceType = cudf::device_storage_type_t; + DeviceType const lower_bound; + DeviceType const upper_bound; + distribution_fn dist; std::optional scale; - random_value_fn(distribution_params const& desc) + random_value_fn(distribution_params const& desc) : lower_bound{desc.lower_bound}, upper_bound{desc.upper_bound}, - dist{make_distribution(desc.id, desc.lower_bound, desc.upper_bound)} + dist{make_distribution(desc.id, desc.lower_bound, desc.upper_bound)} { } - rmm::device_uvector operator()(thrust::minstd_rand& engine, unsigned size) + [[nodiscard]] numeric::scale_type get_scale(thrust::minstd_rand& engine) { if (not scale.has_value()) { - int const max_scale = std::numeric_limits::digits10; + constexpr int max_scale = std::numeric_limits::digits10; std::uniform_int_distribution scale_dist{-max_scale, max_scale}; std::mt19937 engine_scale(engine()); scale = numeric::scale_type{scale_dist(engine_scale)}; } - auto const ints = dist(engine, size); - rmm::device_uvector result(size, cudf::default_stream_value); - // Clamp the generated random value to the specified range - thrust::transform(thrust::device, - ints.begin(), - ints.end(), - result.begin(), - [scale = *(this->scale), - upper_bound = this->upper_bound, - lower_bound = this->lower_bound] __device__(auto int_value) { - return T{std::clamp(int_value, lower_bound, upper_bound), scale}; - }); - return result; + return scale.value_or(numeric::scale_type{0}); + } + + rmm::device_uvector operator()(thrust::minstd_rand& engine, unsigned size) + { + return dist(engine, size); } }; @@ -314,7 +307,7 @@ struct random_value_fn>> { random_value_fn(distribution_params const& desc) : dist{[valid_prob = desc.probability_true](thrust::minstd_rand& engine, size_t size) -> rmm::device_uvector { - rmm::device_uvector result(size, cudf::default_stream_value); + rmm::device_uvector result(size, cudf::get_default_stream()); thrust::tabulate( thrust::device, result.begin(), result.end(), bool_generator(engine, valid_prob)); return result; @@ -366,7 +359,7 @@ rmm::device_uvector sample_indices_with_run_length(cudf::size_t return samples_indices[sample_idx]; }); rmm::device_uvector repeated_sample_indices(num_rows, - cudf::default_stream_value); + cudf::get_default_stream()); thrust::copy(thrust::device, avg_repeated_sample_indices_iterator, avg_repeated_sample_indices_iterator + num_rows, @@ -398,10 +391,18 @@ std::unique_ptr create_random_column(data_profile const& profile, distribution_params{1. 
- profile.get_null_probability().value_or(0)}); auto value_dist = random_value_fn{profile.get_distribution_params()}; + using DeviceType = cudf::device_storage_type_t; + cudf::data_type const dtype = [&]() { + if constexpr (cudf::is_fixed_point()) + return cudf::data_type{cudf::type_to_id(), value_dist.get_scale(engine)}; + else + return cudf::data_type{cudf::type_to_id()}; + }(); + // Distribution for picking elements from the array of samples auto const avg_run_len = profile.get_avg_run_length(); - rmm::device_uvector data(0, cudf::default_stream_value); - rmm::device_uvector null_mask(0, cudf::default_stream_value); + rmm::device_uvector data(0, cudf::get_default_stream()); + rmm::device_uvector null_mask(0, cudf::get_default_stream()); if (profile.get_cardinality() == 0 and avg_run_len == 1) { data = value_dist(engine, num_rows); @@ -412,12 +413,13 @@ std::unique_ptr create_random_column(data_profile const& profile, : profile_cardinality; }(); rmm::device_uvector samples_null_mask = valid_dist(engine, cardinality); - rmm::device_uvector samples = value_dist(engine, cardinality); + rmm::device_uvector samples = value_dist(engine, cardinality); + // generate n samples and gather. auto const sample_indices = sample_indices_with_run_length(avg_run_len, cardinality, num_rows, engine); - data = rmm::device_uvector(num_rows, cudf::default_stream_value); - null_mask = rmm::device_uvector(num_rows, cudf::default_stream_value); + data = rmm::device_uvector(num_rows, cudf::get_default_stream()); + null_mask = rmm::device_uvector(num_rows, cudf::get_default_stream()); thrust::gather( thrust::device, sample_indices.begin(), sample_indices.end(), samples.begin(), data.begin()); thrust::gather(thrust::device, @@ -427,11 +429,11 @@ std::unique_ptr create_random_column(data_profile const& profile, null_mask.begin()); } - auto [result_bitmask, null_count] = - cudf::detail::valid_if(null_mask.begin(), null_mask.end(), thrust::identity{}); + auto [result_bitmask, null_count] = cudf::detail::valid_if( + null_mask.begin(), null_mask.end(), thrust::identity{}, cudf::get_default_stream()); return std::make_unique( - cudf::data_type{cudf::type_to_id()}, + dtype, num_rows, data.release(), profile.get_null_probability().has_value() ? std::move(result_bitmask) : rmm::device_buffer{}); @@ -496,18 +498,18 @@ std::unique_ptr create_random_utf8_string_column(data_profile cons auto valid_lengths = thrust::make_transform_iterator( thrust::make_zip_iterator(thrust::make_tuple(lengths.begin(), null_mask.begin())), valid_or_zero{}); - rmm::device_uvector offsets(num_rows + 1, cudf::default_stream_value); + rmm::device_uvector offsets(num_rows + 1, cudf::get_default_stream()); thrust::exclusive_scan( thrust::device, valid_lengths, valid_lengths + lengths.size(), offsets.begin()); // offsets are ready.
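  // (For context: this is the usual cudf strings-column layout. With num_rows
  // rows, offsets holds num_rows + 1 entries, offsets[0] == 0, and row i
  // occupies bytes [offsets[i], offsets[i+1]) of the chars buffer, so the
  // final offset read back below is the total size of the chars buffer.)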
auto chars_length = *thrust::device_pointer_cast(offsets.end() - 1); - rmm::device_uvector chars(chars_length, cudf::default_stream_value); + rmm::device_uvector chars(chars_length, cudf::get_default_stream()); thrust::for_each_n(thrust::device, thrust::make_zip_iterator(offsets.begin(), offsets.begin() + 1), num_rows, string_generator{chars.data(), engine}); - auto [result_bitmask, null_count] = - cudf::detail::valid_if(null_mask.begin(), null_mask.end() - 1, thrust::identity{}); + auto [result_bitmask, null_count] = cudf::detail::valid_if( + null_mask.begin(), null_mask.end() - 1, thrust::identity{}, cudf::get_default_stream()); return cudf::make_strings_column( num_rows, std::move(offsets), @@ -539,7 +541,8 @@ std::unique_ptr create_random_column(data_profi auto str_table = cudf::detail::gather(cudf::table_view{{sample_strings->view()}}, sample_indices, cudf::out_of_bounds_policy::DONT_CHECK, - cudf::detail::negative_index_policy::NOT_ALLOWED); + cudf::detail::negative_index_policy::NOT_ALLOWED, + cudf::get_default_stream()); return std::move(str_table->release()[0]); } @@ -623,7 +626,8 @@ std::unique_ptr create_random_column(data_profi auto [null_mask, null_count] = [&]() { if (profile.get_null_probability().has_value()) { auto valids = valid_dist(engine, num_rows); - return cudf::detail::valid_if(valids.begin(), valids.end(), thrust::identity{}); + return cudf::detail::valid_if( + valids.begin(), valids.end(), thrust::identity{}, cudf::get_default_stream()); } return std::pair{}; }(); @@ -706,8 +710,8 @@ std::unique_ptr create_random_column(data_profile auto offsets_column = std::make_unique( cudf::data_type{cudf::type_id::INT32}, num_rows + 1, offsets.release()); - auto [null_mask, null_count] = - cudf::detail::valid_if(valids.begin(), valids.end(), thrust::identity{}); + auto [null_mask, null_count] = cudf::detail::valid_if( + valids.begin(), valids.end(), thrust::identity{}, cudf::get_default_stream()); list_column = cudf::make_lists_column( num_rows, std::move(offsets_column), @@ -833,7 +837,8 @@ std::pair create_random_null_mask( } else { return cudf::detail::valid_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(size), - bool_generator{seed, 1.0 - *null_probability}); + bool_generator{seed, 1.0 - *null_probability}, + cudf::get_default_stream()); } } diff --git a/cpp/benchmarks/common/random_distribution_factory.cuh b/cpp/benchmarks/common/random_distribution_factory.cuh index 3cfab858793..36b968c6010 100644 --- a/cpp/benchmarks/common/random_distribution_factory.cuh +++ b/cpp/benchmarks/common/random_distribution_factory.cuh @@ -148,7 +148,7 @@ distribution_fn make_distribution(distribution_id dist_id, T lower_bound, T u case distribution_id::NORMAL: return [lower_bound, upper_bound, dist = make_normal_dist(lower_bound, upper_bound)]( thrust::minstd_rand& engine, size_t size) -> rmm::device_uvector { - rmm::device_uvector result(size, cudf::default_stream_value); + rmm::device_uvector result(size, cudf::get_default_stream()); thrust::tabulate(thrust::device, result.begin(), result.end(), @@ -158,7 +158,7 @@ distribution_fn make_distribution(distribution_id dist_id, T lower_bound, T u case distribution_id::UNIFORM: return [lower_bound, upper_bound, dist = make_uniform_dist(lower_bound, upper_bound)]( thrust::minstd_rand& engine, size_t size) -> rmm::device_uvector { - rmm::device_uvector result(size, cudf::default_stream_value); + rmm::device_uvector result(size, cudf::get_default_stream()); thrust::tabulate(thrust::device, result.begin(), result.end(), @@ 
-169,7 +169,7 @@ distribution_fn make_distribution(distribution_id dist_id, T lower_bound, T u // kind of exponential distribution from lower_bound to upper_bound. return [lower_bound, upper_bound, dist = geometric_distribution(lower_bound, upper_bound)]( thrust::minstd_rand& engine, size_t size) -> rmm::device_uvector { - rmm::device_uvector result(size, cudf::default_stream_value); + rmm::device_uvector result(size, cudf::get_default_stream()); thrust::tabulate(thrust::device, result.begin(), result.end(), diff --git a/cpp/benchmarks/copying/copy_if_else.cpp b/cpp/benchmarks/copying/copy_if_else.cpp index 82f4e15ecb0..9a153a7094c 100644 --- a/cpp/benchmarks/copying/copy_if_else.cpp +++ b/cpp/benchmarks/copying/copy_if_else.cpp @@ -45,7 +45,7 @@ static void BM_copy_if_else(benchmark::State& state, bool nulls) cudf::column_view lhs(input->view().column(0)); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::copy_if_else(lhs, rhs, decision); } } diff --git a/cpp/benchmarks/copying/shift.cu b/cpp/benchmarks/copying/shift.cu index a849b7da58b..957313134b3 100644 --- a/cpp/benchmarks/copying/shift.cu +++ b/cpp/benchmarks/copying/shift.cu @@ -24,7 +24,7 @@ template > std::unique_ptr make_scalar( T value = 0, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto s = new ScalarType(value, true, stream, mr); diff --git a/cpp/benchmarks/groupby/group_max.cpp b/cpp/benchmarks/groupby/group_max.cpp index 8454d1afee6..4956cce0daf 100644 --- a/cpp/benchmarks/groupby/group_max.cpp +++ b/cpp/benchmarks/groupby/group_max.cpp @@ -52,7 +52,7 @@ void bench_groupby_max(nvbench::state& state, nvbench::type_list) requests[0].values = vals->view(); requests[0].aggregations.push_back(cudf::make_max_aggregation()); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); } diff --git a/cpp/benchmarks/groupby/group_nunique.cpp b/cpp/benchmarks/groupby/group_nunique.cpp index 1f95b5d5899..05698c04058 100644 --- a/cpp/benchmarks/groupby/group_nunique.cpp +++ b/cpp/benchmarks/groupby/group_nunique.cpp @@ -65,7 +65,7 @@ void bench_groupby_nunique(nvbench::state& state, nvbench::type_list) auto const requests = make_aggregation_request_vector( *vals, cudf::make_nunique_aggregation()); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { auto const result = gb_obj.aggregate(requests); }); } diff --git a/cpp/benchmarks/groupby/group_struct_keys.cpp b/cpp/benchmarks/groupby/group_struct_keys.cpp index 227a4d5259a..cc6f0faaf41 100644 --- a/cpp/benchmarks/groupby/group_struct_keys.cpp +++ b/cpp/benchmarks/groupby/group_struct_keys.cpp @@ -83,7 +83,7 @@ void bench_groupby_struct_keys(nvbench::state& state) requests[0].aggregations.push_back(cudf::make_min_aggregation()); // Set up nvbench default stream - auto stream = cudf::default_stream_value; + auto stream = cudf::get_default_stream(); 
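  // (nvbench times work on the stream given to set_cuda_stream; pointing it at
  // cudf's default stream ensures the libcudf calls below are what gets
  // measured, rather than an idle internal nvbench stream.)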
state.set_cuda_stream(nvbench::make_cuda_stream_view(stream.value())); state.exec(nvbench::exec_tag::sync, diff --git a/cpp/benchmarks/hashing/hash.cpp b/cpp/benchmarks/hashing/hash.cpp index e997bf296c5..1053c2e4694 100644 --- a/cpp/benchmarks/hashing/hash.cpp +++ b/cpp/benchmarks/hashing/hash.cpp @@ -35,7 +35,7 @@ static void BM_hash(benchmark::State& state, cudf::hash_id hid, contains_nulls h data->get_column(0).set_null_mask(rmm::device_buffer{}, 0); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::hash(data->view(), hid); } } diff --git a/cpp/benchmarks/io/csv/csv_reader_input.cpp b/cpp/benchmarks/io/csv/csv_reader_input.cpp index 4f895e13f1b..27fea856332 100644 --- a/cpp/benchmarks/io/csv/csv_reader_input.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_input.cpp @@ -47,7 +47,7 @@ void csv_read_common(DataType const& data_types, cudf::io::csv_reader_options::builder(source_sink.make_source_info()); auto const mem_stats_logger = cudf::memory_stats_logger(); // init stats logger - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { try_drop_l3_cache(); // Drop L3 cache for accurate measurement diff --git a/cpp/benchmarks/io/csv/csv_reader_options.cpp b/cpp/benchmarks/io/csv/csv_reader_options.cpp index b569dc65f3d..04522c16d5c 100644 --- a/cpp/benchmarks/io/csv/csv_reader_options.cpp +++ b/cpp/benchmarks/io/csv/csv_reader_options.cpp @@ -66,7 +66,7 @@ void BM_csv_read_varying_options( size_t const chunk_size = source_sink.size() / num_chunks; cudf::size_type const chunk_row_cnt = view.num_rows() / num_chunks; auto const mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { try_drop_l3_cache(); // Drop L3 cache for accurate measurement diff --git a/cpp/benchmarks/io/csv/csv_writer.cpp b/cpp/benchmarks/io/csv/csv_writer.cpp index d02305cf478..54a86094eb7 100644 --- a/cpp/benchmarks/io/csv/csv_writer.cpp +++ b/cpp/benchmarks/io/csv/csv_writer.cpp @@ -21,13 +21,11 @@ #include -// to enable, run cmake with -DBUILD_BENCHMARKS=ON - +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput constexpr size_t data_size = 256 << 20; constexpr cudf::size_type num_cols = 64; -namespace cudf_io = cudf::io; - class CsvWrite : public cudf::benchmark { }; @@ -44,9 +42,9 @@ void BM_csv_write_varying_inout(benchmark::State& state) auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view).include_header(true); - cudf_io::write_csv(options); + cudf::io::csv_writer_options options = + cudf::io::csv_writer_options::builder(source_sink.make_sink_info(), view); + cudf::io::write_csv(options); } state.SetBytesProcessed(data_size * state.iterations()); @@ -74,12 +72,11 @@ void
BM_csv_write_varying_options(benchmark::State& state) auto mem_stats_logger = cudf::memory_stats_logger(); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - cudf_io::csv_writer_options options = - cudf_io::csv_writer_options::builder(source_sink.make_sink_info(), view) - .include_header(true) + cudf::io::csv_writer_options options = + cudf::io::csv_writer_options::builder(source_sink.make_sink_info(), view) .na_rep(na_per) .rows_per_chunk(rows_per_chunk); - cudf_io::write_csv(options); + cudf::io::write_csv(options); } state.SetBytesProcessed(data_size * state.iterations()); diff --git a/cpp/benchmarks/io/cuio_common.cpp b/cpp/benchmarks/io/cuio_common.cpp index da64c1bbf3c..1a9c7153644 100644 --- a/cpp/benchmarks/io/cuio_common.cpp +++ b/cpp/benchmarks/io/cuio_common.cpp @@ -23,8 +23,6 @@ #include -namespace cudf_io = cudf::io; - temp_directory const cuio_source_sink_pair::tmpdir{"cudf_gbench"}; std::string random_file_in_dir(std::string const& dir_path) @@ -43,21 +41,21 @@ cuio_source_sink_pair::cuio_source_sink_pair(io_type type) { } -cudf_io::source_info cuio_source_sink_pair::make_source_info() +cudf::io::source_info cuio_source_sink_pair::make_source_info() { switch (type) { - case io_type::FILEPATH: return cudf_io::source_info(file_name); - case io_type::HOST_BUFFER: return cudf_io::source_info(buffer.data(), buffer.size()); + case io_type::FILEPATH: return cudf::io::source_info(file_name); + case io_type::HOST_BUFFER: return cudf::io::source_info(buffer.data(), buffer.size()); default: CUDF_FAIL("invalid input type"); } } -cudf_io::sink_info cuio_source_sink_pair::make_sink_info() +cudf::io::sink_info cuio_source_sink_pair::make_sink_info() { switch (type) { - case io_type::VOID: return cudf_io::sink_info(&void_sink); - case io_type::FILEPATH: return cudf_io::sink_info(file_name); - case io_type::HOST_BUFFER: return cudf_io::sink_info(&buffer); + case io_type::VOID: return cudf::io::sink_info(&void_sink); + case io_type::FILEPATH: return cudf::io::sink_info(file_name); + case io_type::HOST_BUFFER: return cudf::io::sink_info(&buffer); default: CUDF_FAIL("invalid output type"); } } diff --git a/cpp/benchmarks/io/json/nested_json.cpp b/cpp/benchmarks/io/json/nested_json.cpp index bb3e13a3a01..1fe0218bb0f 100644 --- a/cpp/benchmarks/io/json/nested_json.cpp +++ b/cpp/benchmarks/io/json/nested_json.cpp @@ -68,16 +68,16 @@ void BM_NESTED_JSON(nvbench::state& state) auto const string_size{size_type(state.get_int64("string_size"))}; auto const default_options = cudf::io::json_reader_options{}; - auto input = make_test_json_data(string_size, cudf::default_stream_value); + auto input = make_test_json_data(string_size, cudf::get_default_stream()); state.add_element_count(input.size()); // Run algorithm auto const mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { // Allocate device-side temporary storage & run algorithm cudf::io::json::detail::device_parse_nested_json( - input, default_options, cudf::default_stream_value); + input, default_options, cudf::get_default_stream()); }); auto const time = state.get_summary("nv/cold/time/gpu/mean").get_float64("value"); diff --git a/cpp/benchmarks/io/orc/orc_reader_input.cpp b/cpp/benchmarks/io/orc/orc_reader_input.cpp index 46f14cc4874..f1aaf506a60 100644 
--- a/cpp/benchmarks/io/orc/orc_reader_input.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_input.cpp @@ -25,6 +25,8 @@ #include +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; @@ -38,7 +40,7 @@ void orc_read_common(cudf::io::orc_writer_options const& opts, cudf::io::orc_reader_options::builder(source_sink.make_source_info()); auto mem_stats_logger = cudf::memory_stats_logger(); // init stats logger - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { try_drop_l3_cache(); diff --git a/cpp/benchmarks/io/orc/orc_reader_options.cpp b/cpp/benchmarks/io/orc/orc_reader_options.cpp index da64fdcac3a..1b7d33ccd19 100644 --- a/cpp/benchmarks/io/orc/orc_reader_options.cpp +++ b/cpp/benchmarks/io/orc/orc_reader_options.cpp @@ -21,17 +21,27 @@ #include #include +#include #include #include +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; +// The number of separate read calls to use when reading files in multiple chunks +// Each call reads roughly equal amounts of data +constexpr int32_t chunked_read_num_chunks = 8; std::vector get_col_names(cudf::io::source_info const& source) { - cudf::io::orc_reader_options const read_options = - cudf::io::orc_reader_options::builder(source).num_rows(1); - return cudf::io::read_orc(read_options).metadata.column_names; + auto const top_lvl_cols = cudf::io::read_orc_metadata(source).schema().root().children(); + std::vector col_names; + std::transform(top_lvl_cols.cbegin(), + top_lvl_cols.cend(), + std::back_inserter(col_names), + [](auto const& col_meta) { return col_meta.name(); }); + return col_names; } template ; +using row_selections = + nvbench::enum_type_list; NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, NVBENCH_TYPE_AXES(col_selections, @@ -141,11 +149,22 @@ NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, {"column_selection", "row_selection", "uses_index", "uses_numpy_dtype", "timestamp_type"}) .set_min_samples(4); +NVBENCH_BENCH_TYPES(BM_orc_read_varying_options, + NVBENCH_TYPE_AXES(nvbench::enum_type_list, + row_selections, + nvbench::enum_type_list, + nvbench::enum_type_list, + nvbench::enum_type_list)) + .set_name("orc_read_row_selection") + .set_type_axes_names( + {"column_selection", "row_selection", "uses_index", "uses_numpy_dtype", "timestamp_type"}) + .set_min_samples(4); + NVBENCH_BENCH_TYPES( BM_orc_read_varying_options, NVBENCH_TYPE_AXES( nvbench::enum_type_list, - nvbench::enum_type_list, + nvbench::enum_type_list, nvbench::enum_type_list, nvbench::enum_type_list, nvbench::enum_type_list)) diff --git a/cpp/benchmarks/io/orc/orc_writer.cpp b/cpp/benchmarks/io/orc/orc_writer.cpp index ddf699b0eaa..545f8d10122 100644 --- a/cpp/benchmarks/io/orc/orc_writer.cpp +++ b/cpp/benchmarks/io/orc/orc_writer.cpp @@ -38,6 +38,8 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( }, [](auto) { return std::string{}; }) +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput
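// (For scale: 512 << 20 below is 512 MiB of generated input, which works out
// to 8 MiB per column with num_cols = 64.)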
constexpr int64_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; @@ -61,7 +63,7 @@ void BM_orc_write_encode(nvbench::state& state, nvbench::type_list -// to enable, run cmake with -DBUILD_BENCHMARKS=ON - +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; void nvbench_orc_write(nvbench::state& state) @@ -58,7 +58,7 @@ void nvbench_orc_write(nvbench::state& state) size_t encoded_file_size = 0; - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch& launch, auto& timer) { cuio_source_sink_pair source_sink(io_type::VOID); @@ -112,7 +112,7 @@ void nvbench_orc_chunked_write(nvbench::state& state) size_t encoded_file_size = 0; - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec( nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch& launch, auto& timer) { cuio_source_sink_pair source_sink(io_type::VOID); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp index 6477f611421..7a4e649d4fb 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_input.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_input.cpp @@ -25,6 +25,8 @@ #include +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput constexpr size_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; @@ -38,7 +40,7 @@ void parquet_read_common(cudf::io::parquet_writer_options const& write_opts, cudf::io::parquet_reader_options::builder(source_sink.make_source_info()); auto mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { try_drop_l3_cache(); diff --git a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp index 3c1e41c89b8..b5e4f6d8f2b 100644 --- a/cpp/benchmarks/io/parquet/parquet_reader_options.cpp +++ b/cpp/benchmarks/io/parquet/parquet_reader_options.cpp @@ -25,6 +25,8 @@ #include +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput constexpr std::size_t data_size = 512 << 20; constexpr std::size_t row_group_size = 128 << 20; @@ -86,7 +88,7 @@ void BM_parquet_read_options(nvbench::state& state, auto constexpr num_chunks = 1; auto mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec( nvbench::exec_tag::sync | nvbench::exec_tag::timer, [&](nvbench::launch& launch, auto& timer) { try_drop_l3_cache(); diff --git a/cpp/benchmarks/io/parquet/parquet_writer.cpp
b/cpp/benchmarks/io/parquet/parquet_writer.cpp index 747dd5c086c..a0b076abfda 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer.cpp @@ -25,8 +25,6 @@ #include -// to enable, run cmake with -DBUILD_BENCHMARKS=ON - NVBENCH_DECLARE_ENUM_TYPE_STRINGS( cudf::io::statistics_freq, [](auto value) { @@ -34,11 +32,14 @@ NVBENCH_DECLARE_ENUM_TYPE_STRINGS( case cudf::io::statistics_freq::STATISTICS_NONE: return "STATISTICS_NONE"; case cudf::io::statistics_freq::STATISTICS_ROWGROUP: return "STATISTICS_ROWGROUP"; case cudf::io::statistics_freq::STATISTICS_PAGE: return "STATISTICS_PAGE"; + case cudf::io::statistics_freq::STATISTICS_COLUMN: return "STATISTICS_COLUMN"; default: return "Unknown"; } }, [](auto) { return std::string{}; }) +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput constexpr size_t data_size = 512 << 20; constexpr cudf::size_type num_cols = 64; @@ -62,7 +63,7 @@ void BM_parq_write_encode(nvbench::state& state, nvbench::type_list; NVBENCH_BENCH_TYPES(BM_parq_write_encode, NVBENCH_TYPE_AXES(d_type_list)) diff --git a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp index 6c8500a2a70..11b29cc2297 100644 --- a/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp +++ b/cpp/benchmarks/io/parquet/parquet_writer_chunks.cpp @@ -27,8 +27,8 @@ #include -// to enable, run cmake with -DBUILD_BENCHMARKS=ON - +// Size of the data in the benchmark dataframe; chosen to be low enough to allow benchmarks to +// run on most GPUs, but large enough to allow highest throughput constexpr int64_t data_size = 512 << 20; void PQ_write(nvbench::state& state) @@ -44,7 +44,7 @@ void PQ_write(nvbench::state& state) std::size_t encoded_file_size = 0; auto const mem_stats_logger = cudf::memory_stats_logger(); - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch& launch, auto& timer) { cuio_source_sink_pair source_sink(io_type::VOID); @@ -81,7 +81,7 @@ void PQ_write_chunked(nvbench::state& state) auto const mem_stats_logger = cudf::memory_stats_logger(); std::size_t encoded_file_size = 0; - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec( nvbench::exec_tag::timer | nvbench::exec_tag::sync, [&](nvbench::launch& launch, auto& timer) { cuio_source_sink_pair source_sink(io_type::VOID); diff --git a/cpp/benchmarks/io/text/multibyte_split.cpp b/cpp/benchmarks/io/text/multibyte_split.cpp index 4865d11ae8b..75db8e36689 100644 --- a/cpp/benchmarks/io/text/multibyte_split.cpp +++ b/cpp/benchmarks/io/text/multibyte_split.cpp @@ -23,8 +23,10 @@ #include #include +#include #include #include +#include #include #include #include @@ -32,7 +34,6 @@ #include #include -#include #include #include @@ -40,10 +41,26 @@ #include #include #include +#include temp_directory const temp_dir("cudf_nvbench"); -enum class data_chunk_source_type { device, file, host, host_pinned }; +enum class data_chunk_source_type { device, file, file_datasource, host, host_pinned, file_bgzip }; + +NVBENCH_DECLARE_ENUM_TYPE_STRINGS( + data_chunk_source_type, + [](auto value) { 
switch (value) { + case data_chunk_source_type::device: return "device"; + case data_chunk_source_type::file: return "file"; + case data_chunk_source_type::file_datasource: return "file_datasource"; + case data_chunk_source_type::host: return "host"; + case data_chunk_source_type::host_pinned: return "host_pinned"; + case data_chunk_source_type::file_bgzip: return "file_bgzip"; + default: return "Unknown"; + } + }, + [](auto) { return std::string{}; }) static cudf::string_scalar create_random_input(int32_t num_chars, double delim_factor, @@ -78,15 +95,34 @@ static cudf::string_scalar create_random_input(int32_t num_chars, return cudf::string_scalar(std::move(*chars_buffer)); } -static void bench_multibyte_split(nvbench::state& state) +static void write_bgzip_file(cudf::host_span host_data, std::ostream& output_stream) +{ + // a bit of variability with a decent amount of padding so we don't overflow 16 bit block sizes + std::uniform_int_distribution chunk_size_dist{64000, 65000}; + std::default_random_engine rng{}; + std::size_t pos = 0; + while (pos < host_data.size()) { + auto const remainder = host_data.size() - pos; + auto const chunk_size = std::min(remainder, chunk_size_dist(rng)); + cudf::io::text::detail::bgzip::write_compressed_block(output_stream, + {host_data.data() + pos, chunk_size}); + pos += chunk_size; + } + // empty block denotes EOF + cudf::io::text::detail::bgzip::write_uncompressed_block(output_stream, {}); +} + +template +static void bench_multibyte_split(nvbench::state& state, + nvbench::type_list>) { cudf::rmm_pool_raii pool_raii; - auto const source_type = static_cast(state.get_int64("source_type")); - auto const delim_size = state.get_int64("delim_size"); - auto const delim_percent = state.get_int64("delim_percent"); - auto const file_size_approx = state.get_int64("size_approx"); + auto const delim_size = state.get_int64("delim_size"); + auto const delim_percent = state.get_int64("delim_percent"); + auto const file_size_approx = state.get_int64("size_approx"); auto const byte_range_percent = state.get_int64("byte_range_percent"); + auto const strip_delimiters = bool(state.get_int64("strip_delimiters")); auto const byte_range_factor = static_cast(byte_range_percent) / 100; CUDF_EXPECTS(delim_percent >= 1, "delimiter percent must be at least 1"); @@ -99,15 +135,16 @@ static void bench_multibyte_split(nvbench::state& state) std::iota(delim.begin(), delim.end(), '1'); auto const delim_factor = static_cast(delim_percent) / 100; - auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim); - auto host_input = std::vector{}; - auto host_pinned_input = - thrust::host_vector>{}; + std::unique_ptr datasource; + auto device_input = create_random_input(file_size_approx, delim_factor, 0.05, delim); + auto host_input = std::vector{}; + auto host_pinned_input = thrust::host_vector>{}; - if (source_type == data_chunk_source_type::host || source_type == data_chunk_source_type::file) { + if (source_type != data_chunk_source_type::device && + source_type != data_chunk_source_type::host_pinned) { host_input = cudf::detail::make_std_vector_sync( {device_input.data(), static_cast(device_input.size())}, - cudf::default_stream_value); + cudf::get_default_stream()); } if (source_type == data_chunk_source_type::host_pinned) { host_pinned_input.resize(static_cast(device_input.size())); @@ -119,11 +156,17 @@ static void bench_multibyte_split(nvbench::state& state) auto source = [&] { switch (source_type) { - case data_chunk_source_type::file: { + case 
data_chunk_source_type::file: + case data_chunk_source_type::file_datasource: { auto const temp_file_name = random_file_in_dir(temp_dir.path()); std::ofstream(temp_file_name, std::ofstream::out) .write(host_input.data(), host_input.size()); - return cudf::io::text::make_source_from_file(temp_file_name); + if (source_type == data_chunk_source_type::file) { + return cudf::io::text::make_source_from_file(temp_file_name); + } else { + datasource = cudf::io::datasource::create(temp_file_name); + return cudf::io::text::make_source(*datasource); + } } case data_chunk_source_type::host: // return cudf::io::text::make_source(host_input); @@ -131,6 +174,14 @@ static void bench_multibyte_split(nvbench::state& state) return cudf::io::text::make_source(host_pinned_input); case data_chunk_source_type::device: // return cudf::io::text::make_source(device_input); + case data_chunk_source_type::file_bgzip: { + auto const temp_file_name = random_file_in_dir(temp_dir.path()); + { + std::ofstream output_stream(temp_file_name, std::ofstream::out); + write_bgzip_file(host_input, output_stream); + } + return cudf::io::text::make_source_from_bgzip_file(temp_file_name); + } default: CUDF_FAIL(); } }(); @@ -139,12 +190,13 @@ static void bench_multibyte_split(nvbench::state& state) auto const range_size = static_cast(device_input.size() * byte_range_factor); auto const range_offset = (device_input.size() - range_size) / 2; cudf::io::text::byte_range_info range{range_offset, range_size}; + cudf::io::text::parse_options options{range, strip_delimiters}; std::unique_ptr output; - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { try_drop_l3_cache(); - output = cudf::io::text::multibyte_split(*source, delim, range); + output = cudf::io::text::multibyte_split(*source, delim, options); }); state.add_buffer_size(mem_stats_logger.peak_memory_usage(), "pmu", "Peak Memory Usage"); @@ -152,13 +204,16 @@ static void bench_multibyte_split(nvbench::state& state) state.add_buffer_size(range_size, "efs", "Encoded file size"); } -NVBENCH_BENCH(bench_multibyte_split) +using source_type_list = nvbench::enum_type_list; + +NVBENCH_BENCH_TYPES(bench_multibyte_split, NVBENCH_TYPE_AXES(source_type_list)) .set_name("multibyte_split") - .add_int64_axis("source_type", - {static_cast(data_chunk_source_type::device), - static_cast(data_chunk_source_type::file), - static_cast(data_chunk_source_type::host), - static_cast(data_chunk_source_type::host_pinned)}) + .add_int64_axis("strip_delimiters", {0, 1}) .add_int64_axis("delim_size", {1, 4, 7}) .add_int64_axis("delim_percent", {1, 25}) .add_int64_power_of_two_axis("size_approx", {15, 30}) diff --git a/cpp/benchmarks/iterator/iterator.cu b/cpp/benchmarks/iterator/iterator.cu index c121d070ca0..73060200d00 100644 --- a/cpp/benchmarks/iterator/iterator.cu +++ b/cpp/benchmarks/iterator/iterator.cu @@ -56,7 +56,7 @@ inline auto reduce_by_cub(OutputIterator result, InputIterator d_in, int num_ite nullptr, temp_storage_bytes, d_in, result, num_items, cudf::DeviceSum{}, init); // Allocate temporary storage - rmm::device_buffer d_temp_storage(temp_storage_bytes, cudf::default_stream_value); + rmm::device_buffer d_temp_storage(temp_storage_bytes, cudf::get_default_stream()); // Run reduction cub::DeviceReduce::Reduce( @@ -140,7 +140,8 @@ void BM_iterator(benchmark::State& state) cudf::column_view hasnull_F = 
wrap_hasnull_F; // Initialize dev_result to false - auto dev_result = cudf::detail::make_zeroed_device_uvector_sync(1); + auto dev_result = + cudf::detail::make_zeroed_device_uvector_sync(1, cudf::get_default_stream()); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 if (cub_or_thrust) { @@ -208,7 +209,8 @@ void BM_pair_iterator(benchmark::State& state) cudf::column_view hasnull_T = wrap_hasnull_T; // Initialize dev_result to false - auto dev_result = cudf::detail::make_zeroed_device_uvector_sync>(1); + auto dev_result = cudf::detail::make_zeroed_device_uvector_sync>( + 1, cudf::get_default_stream()); for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 if (cub_or_thrust) { diff --git a/cpp/benchmarks/join/conditional_join.cu b/cpp/benchmarks/join/conditional_join.cu index 3c4208bf0fc..547367ffb69 100644 --- a/cpp/benchmarks/join/conditional_join.cu +++ b/cpp/benchmarks/join/conditional_join.cu @@ -70,7 +70,7 @@ CONDITIONAL_LEFT_JOIN_BENCHMARK_DEFINE(conditional_left_join_64bit_nulls, int64_ cudf::table_view const& right, \ cudf::ast::operation binary_pred, \ cudf::null_equality compare_nulls) { \ - return cudf::conditional_inner_join(left, right, binary_pred); \ + return cudf::conditional_full_join(left, right, binary_pred); \ }; \ constexpr bool is_conditional = true; \ BM_join(st, join); \ diff --git a/cpp/benchmarks/join/generate_input_tables.cuh b/cpp/benchmarks/join/generate_input_tables.cuh index 31cef581f22..c606cd8b4c0 100644 --- a/cpp/benchmarks/join/generate_input_tables.cuh +++ b/cpp/benchmarks/join/generate_input_tables.cuh @@ -154,7 +154,7 @@ void generate_input_tables(key_type* const build_tbl, const int num_states = num_sms * std::max(num_blocks_init_build_tbl, num_blocks_init_probe_tbl) * block_size; - rmm::device_uvector devStates(num_states, cudf::default_stream_value); + rmm::device_uvector devStates(num_states, cudf::get_default_stream()); init_curand<<<(num_states - 1) / block_size + 1, block_size>>>(devStates.data(), num_states); diff --git a/cpp/benchmarks/join/join_common.hpp b/cpp/benchmarks/join/join_common.hpp index 1a87c2d1158..ad288edb169 100644 --- a/cpp/benchmarks/join/join_common.hpp +++ b/cpp/benchmarks/join/join_common.hpp @@ -86,7 +86,9 @@ static void BM_join(state_type& state, Join JoinFunc) // roughly 75% nulls auto validity = thrust::make_transform_iterator(thrust::make_counting_iterator(0), null75_generator{}); - return cudf::detail::valid_if(validity, validity + size, thrust::identity{}).first; + return cudf::detail::valid_if( + validity, validity + size, thrust::identity{}, cudf::get_default_stream()) + .first; }; std::unique_ptr build_key_column0 = [&]() { @@ -142,7 +144,7 @@ static void BM_join(state_type& state, Join JoinFunc) // Benchmark the inner join operation if constexpr (std::is_same_v and (not is_conditional)) { for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = JoinFunc(probe_table.select(columns_to_join), build_table.select(columns_to_join), @@ -168,7 +170,7 @@ static void BM_join(state_type& state, Join JoinFunc) cudf::ast::operation(cudf::ast::ast_operator::EQUAL, col_ref_left_0, col_ref_right_0); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = JoinFunc(probe_table, build_table, left_zero_eq_right_zero, 
cudf::null_equality::UNEQUAL); diff --git a/cpp/benchmarks/lists/copying/scatter_lists.cu b/cpp/benchmarks/lists/copying/scatter_lists.cu index 823693721a0..02ad97fee11 100644 --- a/cpp/benchmarks/lists/copying/scatter_lists.cu +++ b/cpp/benchmarks/lists/copying/scatter_lists.cu @@ -40,7 +40,7 @@ class ScatterLists : public cudf::benchmark { template void BM_lists_scatter(::benchmark::State& state) { - auto stream = cudf::default_stream_value; + auto stream = cudf::get_default_stream(); auto mr = rmm::mr::get_current_device_resource(); const size_type base_size{(size_type)state.range(0)}; @@ -108,7 +108,7 @@ void BM_lists_scatter(::benchmark::State& state) for (auto _ : state) { cuda_event_timer raii(state, true); // flush_l2_cache = true, stream = 0 - scatter(table_view{{*source}}, *scatter_map, table_view{{*target}}, false, mr); + scatter(table_view{{*source}}, *scatter_map, table_view{{*target}}, mr); } state.SetBytesProcessed(static_cast(state.iterations()) * state.range(0) * 2 * diff --git a/cpp/benchmarks/quantiles/quantiles.cpp b/cpp/benchmarks/quantiles/quantiles.cpp index 7c0a88584f8..599cff2bcda 100644 --- a/cpp/benchmarks/quantiles/quantiles.cpp +++ b/cpp/benchmarks/quantiles/quantiles.cpp @@ -50,7 +50,7 @@ static void BM_quantiles(benchmark::State& state, bool nulls) thrust::seq, q.begin(), q.end(), [n_quantiles](auto i) { return i * (1.0f / n_quantiles); }); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = cudf::quantiles(input, q); // auto result = (stable) ? cudf::stable_sorted_order(input) : cudf::sorted_order(input); diff --git a/cpp/benchmarks/reduction/anyall.cpp b/cpp/benchmarks/reduction/anyall.cpp index 80a85b0f217..755fa1ca2ad 100644 --- a/cpp/benchmarks/reduction/anyall.cpp +++ b/cpp/benchmarks/reduction/anyall.cpp @@ -41,7 +41,7 @@ void BM_reduction_anyall(benchmark::State& state, for (auto _ : state) { cuda_event_timer timer(state, true); - auto result = cudf::reduce(*values, agg, output_dtype); + auto result = cudf::reduce(*values, *agg, output_dtype); } } diff --git a/cpp/benchmarks/reduction/dictionary.cpp b/cpp/benchmarks/reduction/dictionary.cpp index 219564d6b5c..8f2f0be33ca 100644 --- a/cpp/benchmarks/reduction/dictionary.cpp +++ b/cpp/benchmarks/reduction/dictionary.cpp @@ -51,7 +51,7 @@ void BM_reduction_dictionary(benchmark::State& state, for (auto _ : state) { cuda_event_timer timer(state, true); - auto result = cudf::reduce(*values, agg, output_dtype); + auto result = cudf::reduce(*values, *agg, output_dtype); } } diff --git a/cpp/benchmarks/reduction/rank.cpp b/cpp/benchmarks/reduction/rank.cpp index c20f728e018..5022e029d97 100644 --- a/cpp/benchmarks/reduction/rank.cpp +++ b/cpp/benchmarks/reduction/rank.cpp @@ -61,4 +61,4 @@ NVBENCH_BENCH_TYPES(nvbench_reduction_scan, NVBENCH_TYPE_AXES(data_type)) 1000000, // 1M 10000000, // 10M 100000000, // 100M - }); \ No newline at end of file + }); diff --git a/cpp/benchmarks/reduction/reduce.cpp b/cpp/benchmarks/reduction/reduce.cpp index 4e354352c11..4dfa7f0bbdc 100644 --- a/cpp/benchmarks/reduction/reduce.cpp +++ b/cpp/benchmarks/reduction/reduce.cpp @@ -45,7 +45,7 @@ void BM_reduction(benchmark::State& state, std::unique_ptr(), cudf::scan_type::INCLUSIVE); + *column, *cudf::make_min_aggregation(), cudf::scan_type::INCLUSIVE); } } diff --git a/cpp/benchmarks/reduction/segment_reduce.cu b/cpp/benchmarks/reduction/segment_reduce.cu index d2c15c87c2b..e063adb25f9 100644 --- 
a/cpp/benchmarks/reduction/segment_reduce.cu +++ b/cpp/benchmarks/reduction/segment_reduce.cu @@ -109,7 +109,7 @@ void BM_Simple_Segmented_Reduction(nvbench::state& state, auto const input_view = input->view(); auto const offset_span = cudf::device_span{offsets}; - state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec( nvbench::exec_tag::sync, [input_view, output_type, offset_span, &agg](nvbench::launch& launch) { segmented_reduce(input_view, offset_span, *agg, output_type, cudf::null_policy::INCLUDE); diff --git a/cpp/benchmarks/sort/rank.cpp b/cpp/benchmarks/sort/rank.cpp index 66277443800..2c26f4fa15d 100644 --- a/cpp/benchmarks/sort/rank.cpp +++ b/cpp/benchmarks/sort/rank.cpp @@ -37,7 +37,7 @@ static void BM_rank(benchmark::State& state, bool nulls) auto keys = create_random_column(cudf::type_to_id(), row_count{n_rows}, profile); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = cudf::rank(keys->view(), cudf::rank_method::FIRST, diff --git a/cpp/benchmarks/sort/segmented_sort.cpp b/cpp/benchmarks/sort/segmented_sort.cpp new file mode 100644 index 00000000000..7162269853c --- /dev/null +++ b/cpp/benchmarks/sort/segmented_sort.cpp @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include +#include + +#include +#include +#include +#include + +#include + +void nvbench_segmented_sort(nvbench::state& state) +{ + cudf::rmm_pool_raii pool_raii; + + auto const dtype = cudf::type_to_id(); + auto const size_bytes = static_cast(state.get_int64("size_bytes")); + auto const null_freq = state.get_float64("null_frequency"); + auto const row_width = static_cast(state.get_int64("row_width")); + + data_profile const table_profile = + data_profile_builder().null_probability(null_freq).distribution( + dtype, distribution_id::UNIFORM, 0, 10); + auto const input = + create_random_table({cudf::type_id::INT32}, table_size_bytes{size_bytes}, table_profile); + auto const rows = input->num_rows(); + + auto const segments = cudf::sequence((rows / row_width) + 1, + cudf::numeric_scalar(0), + cudf::numeric_scalar(row_width)); + + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); + state.add_element_count(size_bytes, "bytes"); + state.add_global_memory_reads(rows * row_width); + state.add_global_memory_writes(rows); + + state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { + auto result = cudf::segmented_sorted_order(*input, *segments); + }); +} + +NVBENCH_BENCH(nvbench_segmented_sort) + .set_name("segmented_sort") + .add_int64_power_of_two_axis("size_bytes", {16, 18, 20, 22, 24, 28}) + .add_float64_axis("null_frequency", {0, 0.1}) + .add_int64_axis("row_width", {16, 128, 1024}); diff --git a/cpp/benchmarks/sort/sort.cpp b/cpp/benchmarks/sort/sort.cpp index 13502ce0959..304bac06632 100644 --- a/cpp/benchmarks/sort/sort.cpp +++ b/cpp/benchmarks/sort/sort.cpp @@ -42,7 +42,7 @@ static void BM_sort(benchmark::State& state, bool nulls) cudf::table_view input{*input_table}; for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = (stable) ? 
cudf::stable_sorted_order(input) : cudf::sorted_order(input); } diff --git a/cpp/benchmarks/sort/sort_strings.cpp b/cpp/benchmarks/sort/sort_strings.cpp index 701b392f80b..572c05d69cb 100644 --- a/cpp/benchmarks/sort/sort_strings.cpp +++ b/cpp/benchmarks/sort/sort_strings.cpp @@ -32,7 +32,7 @@ static void BM_sort(benchmark::State& state) auto const table = create_random_table({cudf::type_id::STRING}, row_count{n_rows}); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::sort(table->view()); } } diff --git a/cpp/benchmarks/stream_compaction/distinct.cpp b/cpp/benchmarks/stream_compaction/distinct.cpp index ad837bc4caa..512554ff1bc 100644 --- a/cpp/benchmarks/stream_compaction/distinct.cpp +++ b/cpp/benchmarks/stream_compaction/distinct.cpp @@ -18,8 +18,8 @@ #include #include -#include #include +#include #include #include @@ -41,14 +41,13 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list) auto input_column = source_column->view(); auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto result = cudf::detail::distinct(input_table, - {0}, - cudf::duplicate_keep_option::KEEP_ANY, - cudf::null_equality::EQUAL, - cudf::nan_equality::ALL_EQUAL, - stream_view); + auto result = cudf::distinct(input_table, + {0}, + cudf::duplicate_keep_option::KEEP_ANY, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL); }); } @@ -84,14 +83,13 @@ void nvbench_distinct_list(nvbench::state& state, nvbench::type_list) auto const table = create_random_table( {dtype}, table_size_bytes{static_cast(size)}, data_profile{builder}, 0); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto result = cudf::detail::distinct(*table, - {0}, - cudf::duplicate_keep_option::KEEP_ANY, - cudf::null_equality::EQUAL, - cudf::nan_equality::ALL_EQUAL, - stream_view); + auto result = cudf::distinct(*table, + {0}, + cudf::duplicate_keep_option::KEEP_ANY, + cudf::null_equality::EQUAL, + cudf::nan_equality::ALL_EQUAL); }); } diff --git a/cpp/benchmarks/stream_compaction/unique.cpp b/cpp/benchmarks/stream_compaction/unique.cpp index 6b586581408..652d55fb8ce 100644 --- a/cpp/benchmarks/stream_compaction/unique.cpp +++ b/cpp/benchmarks/stream_compaction/unique.cpp @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include @@ -62,10 +62,9 @@ void nvbench_unique(nvbench::state& state, nvbench::type_listview(); auto input_table = cudf::table_view({input_column, input_column, input_column, input_column}); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) { - rmm::cuda_stream_view stream_view{launch.get_stream()}; - auto result = - cudf::detail::unique(input_table, {0}, Keep, cudf::null_equality::EQUAL, stream_view); + auto result = cudf::unique(input_table, {0}, Keep, cudf::null_equality::EQUAL); }); } diff --git a/cpp/benchmarks/string/case.cpp b/cpp/benchmarks/string/case.cpp index 1c43fa0f077..72b6fcaff0e 100644 --- a/cpp/benchmarks/string/case.cpp +++ 
b/cpp/benchmarks/string/case.cpp @@ -32,7 +32,7 @@ static void BM_case(benchmark::State& state) cudf::strings_column_view input(column->view()); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::strings::to_lower(input); } diff --git a/cpp/benchmarks/string/combine.cpp b/cpp/benchmarks/string/combine.cpp index a8d0224916b..46bcda9ae92 100644 --- a/cpp/benchmarks/string/combine.cpp +++ b/cpp/benchmarks/string/combine.cpp @@ -41,7 +41,7 @@ static void BM_combine(benchmark::State& state) cudf::string_scalar separator("+"); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::strings::concatenate(table->view(), separator); } diff --git a/cpp/benchmarks/string/contains.cpp b/cpp/benchmarks/string/contains.cpp index fd04d599e5e..f7f394ea048 100644 --- a/cpp/benchmarks/string/contains.cpp +++ b/cpp/benchmarks/string/contains.cpp @@ -85,7 +85,7 @@ static void BM_contains(benchmark::State& state, contains_type ct) auto pattern = patterns[pattern_index]; for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (ct) { case contains_type::contains: // contains_re and matches_re use the same main logic cudf::strings::contains_re(input, pattern); diff --git a/cpp/benchmarks/string/copy.cu b/cpp/benchmarks/string/copy.cu index 318d2d524a3..669b12aa56b 100644 --- a/cpp/benchmarks/string/copy.cu +++ b/cpp/benchmarks/string/copy.cu @@ -58,7 +58,7 @@ static void BM_copy(benchmark::State& state, copy_type ct) thrust::default_random_engine()); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (ct) { case gather: cudf::gather(source->view(), index_map); break; case scatter: cudf::scatter(source->view(), index_map, target->view()); break; diff --git a/cpp/benchmarks/string/factory.cu b/cpp/benchmarks/string/factory.cu index 0e937b91e98..b75de16e901 100644 --- a/cpp/benchmarks/string/factory.cu +++ b/cpp/benchmarks/string/factory.cu @@ -55,7 +55,7 @@ static void BM_factory(benchmark::State& state) cudf::type_id::STRING, distribution_id::NORMAL, 0, max_str_length); auto const column = create_random_column(cudf::type_id::STRING, row_count{n_rows}, profile); auto d_column = cudf::column_device_view::create(column->view()); - rmm::device_uvector pairs(d_column->size(), cudf::default_stream_value); + rmm::device_uvector pairs(d_column->size(), cudf::get_default_stream()); thrust::transform(thrust::device, d_column->pair_begin(), d_column->pair_end(), @@ -63,7 +63,7 @@ static void BM_factory(benchmark::State& state) string_view_to_pair{}); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::make_strings_column(pairs); } diff --git a/cpp/benchmarks/string/filter.cpp b/cpp/benchmarks/string/filter.cpp index 4001fef5da6..cb805539651 100644 --- a/cpp/benchmarks/string/filter.cpp +++ b/cpp/benchmarks/string/filter.cpp @@ -14,6 +14,8 @@ * limitations under the License. 
*/ +#include "string_bench_args.hpp" + #include #include #include @@ -27,7 +29,6 @@ #include #include -#include #include enum FilterAPI { filter, filter_chars, strip }; @@ -49,7 +50,7 @@ static void BM_filter_chars(benchmark::State& state, FilterAPI api) {cudf::char_utf8{'a'}, cudf::char_utf8{'c'}}}; for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (api) { case filter: cudf::strings::filter_characters_of_type(input, types); break; case filter_chars: cudf::strings::filter_characters(input, filter_table); break; @@ -62,21 +63,14 @@ static void BM_filter_chars(benchmark::State& state, FilterAPI api) static void generate_bench_args(benchmark::internal::Benchmark* b) { - int const min_rows = 1 << 12; - int const max_rows = 1 << 24; - int const row_mult = 8; - int const min_rowlen = 1 << 5; - int const max_rowlen = 1 << 13; - int const len_mult = 4; - for (int row_count = min_rows; row_count <= max_rows; row_count *= row_mult) { - for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= len_mult) { - // avoid generating combinations that exceed the cudf column limit - size_t total_chars = static_cast(row_count) * rowlen; - if (total_chars < static_cast(std::numeric_limits::max())) { - b->Args({row_count, rowlen}); - } - } - } + int const min_rows = 1 << 12; + int const max_rows = 1 << 24; + int const row_multiplier = 8; + int const min_length = 1 << 5; + int const max_length = 1 << 13; + int const length_multiplier = 2; + generate_string_bench_args( + b, min_rows, max_rows, row_multiplier, min_length, max_length, length_multiplier); } #define STRINGS_BENCHMARK_DEFINE(name) \ diff --git a/cpp/benchmarks/string/find.cpp b/cpp/benchmarks/string/find.cpp index 62c76d18e1a..4ff3b59a491 100644 --- a/cpp/benchmarks/string/find.cpp +++ b/cpp/benchmarks/string/find.cpp @@ -45,7 +45,7 @@ static void BM_find_scalar(benchmark::State& state, FindAPI find_api) cudf::test::strings_column_wrapper targets({"+", "-"}); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (find_api) { case find: cudf::strings::find(input, target); break; case find_multi: diff --git a/cpp/benchmarks/string/json.cu b/cpp/benchmarks/string/json.cu index 5ee56c3cdae..d7c0066eb33 100644 --- a/cpp/benchmarks/string/json.cu +++ b/cpp/benchmarks/string/json.cu @@ -177,7 +177,8 @@ auto build_json_string_column(int desired_bytes, int num_rows) auto d_store_order = cudf::column_device_view::create(float_2bool_columns->get_column(2)); json_benchmark_row_builder jb{ desired_bytes, num_rows, {*d_books, *d_bicycles}, *d_book_pct, *d_misc_order, *d_store_order}; - auto children = cudf::strings::detail::make_strings_children(jb, num_rows); + auto children = cudf::strings::detail::make_strings_children( + jb, num_rows, cudf::get_default_stream(), rmm::mr::get_current_device_resource()); return cudf::make_strings_column( num_rows, std::move(children.first), std::move(children.second), 0, {}); } diff --git a/cpp/benchmarks/string/like.cpp b/cpp/benchmarks/string/like.cpp index f6649b186a4..de7382f5a75 100644 --- a/cpp/benchmarks/string/like.cpp +++ b/cpp/benchmarks/string/like.cpp @@ -81,7 +81,7 @@ static void bench_like(nvbench::state& state) // This pattern forces reading the entire target string (when matched expected) auto pattern = std::string("% 5W4_"); // regex equivalent: ".* 5W4." 
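The filter.cpp hunk above folds its hand-rolled row-count/row-length sweep into the shared `generate_string_bench_args` helper from `string_bench_args.hpp`. A sketch of that helper, reconstructed from the loop the hunk removes (the exact signature in the real header may differ):

```cpp
#include <benchmark/benchmark.h>

#include <cstddef>
#include <cstdint>
#include <limits>

// Enumerate (row_count, row_length) pairs on geometric grids, skipping any
// combination whose total character count would overflow cudf::size_type
// (a 32-bit int), which bounds the size of a single strings column.
inline void generate_string_bench_args(benchmark::internal::Benchmark* b,
                                       int min_rows,
                                       int max_rows,
                                       int rows_mult,
                                       int min_rowlen,
                                       int max_rowlen,
                                       int rowlen_mult)
{
  for (int row_count = min_rows; row_count <= max_rows; row_count *= rows_mult) {
    for (int rowlen = min_rowlen; rowlen <= max_rowlen; rowlen *= rowlen_mult) {
      auto const total_chars = static_cast<std::size_t>(row_count) * rowlen;
      if (total_chars < static_cast<std::size_t>(std::numeric_limits<std::int32_t>::max())) {
        b->Args({row_count, rowlen});
      }
    }
  }
}
```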
- state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::default_stream_value.value())); + state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value())); // gather some throughput statistics as well auto chars_size = input.chars_size(); state.add_element_count(chars_size, "chars_size"); // number of bytes; diff --git a/cpp/benchmarks/string/repeat_strings.cpp b/cpp/benchmarks/string/repeat_strings.cpp index db02fec13c2..1844e93bc53 100644 --- a/cpp/benchmarks/string/repeat_strings.cpp +++ b/cpp/benchmarks/string/repeat_strings.cpp @@ -55,7 +55,7 @@ static void BM_repeat_strings_scalar_times(benchmark::State& state) auto const strings_col = cudf::strings_column_view(table->view().column(0)); for ([[maybe_unused]] auto _ : state) { - [[maybe_unused]] cuda_event_timer raii(state, true, cudf::default_stream_value); + [[maybe_unused]] cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::strings::repeat_strings(strings_col, default_repeat_times); } @@ -71,7 +71,7 @@ static void BM_repeat_strings_column_times(benchmark::State& state) auto const repeat_times_col = table->view().column(1); for ([[maybe_unused]] auto _ : state) { - [[maybe_unused]] cuda_event_timer raii(state, true, cudf::default_stream_value); + [[maybe_unused]] cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::strings::repeat_strings(strings_col, repeat_times_col); } @@ -88,7 +88,7 @@ static void BM_compute_output_strings_sizes(benchmark::State& state) auto const repeat_times_col = table->view().column(1); for ([[maybe_unused]] auto _ : state) { - [[maybe_unused]] cuda_event_timer raii(state, true, cudf::default_stream_value); + [[maybe_unused]] cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::strings::repeat_strings_output_sizes(strings_col, repeat_times_col); } @@ -107,7 +107,7 @@ static void BM_repeat_strings_column_times_precomputed_sizes(benchmark::State& s cudf::strings::repeat_strings_output_sizes(strings_col, repeat_times_col); for ([[maybe_unused]] auto _ : state) { - [[maybe_unused]] cuda_event_timer raii(state, true, cudf::default_stream_value); + [[maybe_unused]] cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::strings::repeat_strings(strings_col, repeat_times_col, *sizes); } diff --git a/cpp/benchmarks/string/replace.cpp b/cpp/benchmarks/string/replace.cpp index e25bf679dbc..b25af14ec2a 100644 --- a/cpp/benchmarks/string/replace.cpp +++ b/cpp/benchmarks/string/replace.cpp @@ -48,7 +48,7 @@ static void BM_replace(benchmark::State& state, replace_type rt) cudf::test::strings_column_wrapper repls({"", ""}); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (rt) { case scalar: cudf::strings::replace(input, target, repl); break; case slice: cudf::strings::replace_slice(input, repl, 1, 10); break; diff --git a/cpp/benchmarks/string/replace_re.cpp b/cpp/benchmarks/string/replace_re.cpp index f8b03daa338..7e9d6036750 100644 --- a/cpp/benchmarks/string/replace_re.cpp +++ b/cpp/benchmarks/string/replace_re.cpp @@ -42,7 +42,7 @@ static void BM_replace(benchmark::State& state, replace_type rt) cudf::test::strings_column_wrapper repls({"#", ""}); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (rt) { case replace_type::replace_re: // contains_re and matches_re use the same main logic 
cudf::strings::replace_re(input, "\\d+"); diff --git a/cpp/benchmarks/string/split.cpp b/cpp/benchmarks/string/split.cpp index 3a7a96b025d..0f005c462cc 100644 --- a/cpp/benchmarks/string/split.cpp +++ b/cpp/benchmarks/string/split.cpp @@ -43,7 +43,7 @@ static void BM_split(benchmark::State& state, split_type rt) cudf::string_scalar target("+"); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (rt) { case split: cudf::strings::split(input, target); break; case split_ws: cudf::strings::split(input); break; diff --git a/cpp/benchmarks/string/substring.cpp b/cpp/benchmarks/string/substring.cpp index 7ae5ad6f581..1201b240013 100644 --- a/cpp/benchmarks/string/substring.cpp +++ b/cpp/benchmarks/string/substring.cpp @@ -52,7 +52,7 @@ static void BM_substring(benchmark::State& state, substring_type rt) cudf::test::strings_column_wrapper delimiters(delim_itr, delim_itr + n_rows); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (rt) { case position: cudf::strings::slice_strings(input, 1, max_str_length / 2); break; case multi_position: cudf::strings::slice_strings(input, starts, stops); break; diff --git a/cpp/benchmarks/string/translate.cpp b/cpp/benchmarks/string/translate.cpp index 359a3756ef2..efc2fa3154b 100644 --- a/cpp/benchmarks/string/translate.cpp +++ b/cpp/benchmarks/string/translate.cpp @@ -53,7 +53,7 @@ static void BM_translate(benchmark::State& state, int entry_count) }); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); cudf::strings::translate(input, entries); } diff --git a/cpp/benchmarks/string/url_decode.cu b/cpp/benchmarks/string/url_decode.cu index a884bc8b587..44681c924d0 100644 --- a/cpp/benchmarks/string/url_decode.cu +++ b/cpp/benchmarks/string/url_decode.cu @@ -91,7 +91,7 @@ void BM_url_decode(benchmark::State& state, int esc_seq_pct) auto strings_view = cudf::strings_column_view(column->view()); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); auto result = cudf::strings::url_decode(strings_view); } diff --git a/cpp/benchmarks/synchronization/synchronization.hpp b/cpp/benchmarks/synchronization/synchronization.hpp index e5882ff1c16..e56d881d459 100644 --- a/cpp/benchmarks/synchronization/synchronization.hpp +++ b/cpp/benchmarks/synchronization/synchronization.hpp @@ -35,7 +35,7 @@ for (auto _ : state){ // default stream, could be another stream - rmm::cuda_stream_view stream{cudf::default_stream_value}; + rmm::cuda_stream_view stream{cudf::get_default_stream()}; // Create (Construct) an object of this class. You HAVE to pass in the // benchmark::State object you are using. It measures the time from its @@ -58,8 +58,7 @@ */ -#ifndef CUDF_BENCH_SYNCHRONIZATION_H -#define CUDF_BENCH_SYNCHRONIZATION_H +#pragma once // Google Benchmark library #include @@ -85,7 +84,7 @@ class cuda_event_timer { */ cuda_event_timer(benchmark::State& state, bool flush_l2_cache, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream = cudf::get_default_stream()); // The user must provide a benchmark::State object to set // the timer so we disable the default c'tor. 
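Condensing the usage that synchronization.hpp's own comment block documents: `cuda_event_timer` is an RAII guard that records a CUDA event at construction and another at destruction, so each Google Benchmark iteration reports GPU time on the chosen stream. A sketch, assuming the `benchmarks/synchronization/synchronization.hpp` include path used elsewhere in this tree:

```cpp
#include <benchmarks/synchronization/synchronization.hpp>

#include <cudf/utilities/default_stream.hpp>

#include <rmm/cuda_stream_view.hpp>

#include <benchmark/benchmark.h>

void BM_example(benchmark::State& state)
{
  rmm::cuda_stream_view stream{cudf::get_default_stream()};
  for (auto _ : state) {
    // `true` flushes the L2 cache before timing starts; the elapsed GPU time
    // is reported to `state` when `raii` goes out of scope each iteration
    cuda_event_timer raii(state, true, stream);
    // launch the GPU work to be measured here
  }
}
BENCHMARK(BM_example);
```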
@@ -102,5 +101,3 @@ class cuda_event_timer { rmm::cuda_stream_view stream; benchmark::State* p_state; }; - -#endif diff --git a/cpp/benchmarks/text/normalize.cpp b/cpp/benchmarks/text/normalize.cpp index e5a0a1a95f4..91d873224d3 100644 --- a/cpp/benchmarks/text/normalize.cpp +++ b/cpp/benchmarks/text/normalize.cpp @@ -37,7 +37,7 @@ static void BM_normalize(benchmark::State& state, bool to_lower) cudf::strings_column_view input(column->view()); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); nvtext::normalize_characters(input, to_lower); } diff --git a/cpp/benchmarks/text/normalize_spaces.cpp b/cpp/benchmarks/text/normalize_spaces.cpp index 414cd119575..85eaf54d4ea 100644 --- a/cpp/benchmarks/text/normalize_spaces.cpp +++ b/cpp/benchmarks/text/normalize_spaces.cpp @@ -38,7 +38,7 @@ static void BM_normalize(benchmark::State& state) cudf::strings_column_view input(column->view()); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); nvtext::normalize_spaces(input); } diff --git a/cpp/benchmarks/text/tokenize.cpp b/cpp/benchmarks/text/tokenize.cpp index 4d8df6ae37c..4695a62f1c0 100644 --- a/cpp/benchmarks/text/tokenize.cpp +++ b/cpp/benchmarks/text/tokenize.cpp @@ -44,7 +44,7 @@ static void BM_tokenize(benchmark::State& state, tokenize_type tt) cudf::test::strings_column_wrapper delimiters({" ", "+", "-"}); for (auto _ : state) { - cuda_event_timer raii(state, true, cudf::default_stream_value); + cuda_event_timer raii(state, true, cudf::get_default_stream()); switch (tt) { case tokenize_type::single: // single whitespace delimiter diff --git a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu index b1d2498f0e6..34b1e0254dd 100644 --- a/cpp/benchmarks/type_dispatcher/type_dispatcher.cu +++ b/cpp/benchmarks/type_dispatcher/type_dispatcher.cu @@ -188,10 +188,10 @@ void type_dispatcher_benchmark(::benchmark::State& state) std::vector h_vec(n_cols); std::vector h_vec_p(n_cols); std::transform(h_vec.begin(), h_vec.end(), h_vec_p.begin(), [source_size](auto& col) { - col.resize(source_size * sizeof(TypeParam), cudf::default_stream_value); + col.resize(source_size * sizeof(TypeParam), cudf::get_default_stream()); return static_cast(col.data()); }); - rmm::device_uvector d_vec(n_cols, cudf::default_stream_value); + rmm::device_uvector d_vec(n_cols, cudf::get_default_stream()); if (dispatching_type == NO_DISPATCHING) { CUDF_CUDA_TRY(cudaMemcpy( diff --git a/cpp/cmake/Modules/ConfigureCUDA.cmake b/cpp/cmake/Modules/ConfigureCUDA.cmake index 198435e739d..f79e4c37228 100644 --- a/cpp/cmake/Modules/ConfigureCUDA.cmake +++ b/cpp/cmake/Modules/ConfigureCUDA.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2018-2021, NVIDIA CORPORATION. +# Copyright (c) 2018-2022, NVIDIA CORPORATION. # # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. 
You may obtain a copy of the License at @@ -19,10 +19,14 @@ endif() list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) # set warnings as errors -list(APPEND CUDF_CUDA_FLAGS -Werror=cross-execution-space-call) +if(CUDA_WARNINGS_AS_ERRORS) + list(APPEND CUDF_CUDA_FLAGS -Werror=all-warnings) +else() + list(APPEND CUDF_CUDA_FLAGS -Werror=cross-execution-space-call) +endif() list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-Wall,-Werror,-Wno-error=deprecated-declarations) -if(DISABLE_DEPRECATION_WARNING) +if(DISABLE_DEPRECATION_WARNINGS) list(APPEND CUDF_CXX_FLAGS -Wno-deprecated-declarations) list(APPEND CUDF_CUDA_FLAGS -Xcompiler=-Wno-deprecated-declarations) endif() diff --git a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake index 8ce98c6d582..df285bdea55 100644 --- a/cpp/cmake/Modules/JitifyPreprocessKernels.cmake +++ b/cpp/cmake/Modules/JitifyPreprocessKernels.cmake @@ -39,7 +39,8 @@ function(jit_preprocess_files) VERBATIM COMMAND ${CMAKE_COMMAND} -E make_directory "${jit_output_directory}" COMMAND - jitify_preprocess ${ARG_FILE} -o + "${CMAKE_COMMAND}" -E env LD_LIBRARY_PATH=${CUDAToolkit_LIBRARY_DIR} + $ ${ARG_FILE} -o ${CUDF_GENERATED_INCLUDE_DIR}/include/jit_preprocessed_files -i -m -std=c++17 -remove-unused-globals -D_FILE_OFFSET_BITS=64 -D__CUDACC_RTC__ -I${CUDF_SOURCE_DIR}/include -I${CUDF_SOURCE_DIR}/src ${libcudacxx_includes} -I${CUDAToolkit_INCLUDE_DIRS} diff --git a/cpp/cmake/config.json b/cpp/cmake/config.json index 4f287499503..f7d7b001856 100644 --- a/cpp/cmake/config.json +++ b/cpp/cmake/config.json @@ -9,7 +9,7 @@ "VERSION": "?", "GIT_SHALLOW": "?", "OPTIONS": "*", - "FIND_PACKAGE_ARGUMENTS": "*" + "FIND_PACKAGE_ARGUMENTS": "*" } }, "ConfigureTest": { diff --git a/cpp/cmake/thirdparty/get_arrow.cmake b/cpp/cmake/thirdparty/get_arrow.cmake index 9fa5b9d1658..94dcdcb5bc2 100644 --- a/cpp/cmake/thirdparty/get_arrow.cmake +++ b/cpp/cmake/thirdparty/get_arrow.cmake @@ -20,43 +20,98 @@ # cmake-lint: disable=R0912,R0913,R0915 +include_guard(GLOBAL) + +# Generate a FindArrow module for the case where we need to search for arrow within a pip install +# of pyarrow. +function(find_libarrow_in_python_wheel PYARROW_VERSION) + string(REPLACE "." "" PYARROW_SO_VER "${PYARROW_VERSION}") + set(PYARROW_LIB libarrow.so.${PYARROW_SO_VER}) + + find_package(Python REQUIRED) + execute_process( + COMMAND "${Python_EXECUTABLE}" -c "import pyarrow; print(pyarrow.get_library_dirs()[0])" + OUTPUT_VARIABLE CUDF_PYARROW_WHEEL_DIR + OUTPUT_STRIP_TRAILING_WHITESPACE + ) + list(APPEND CMAKE_PREFIX_PATH "${CUDF_PYARROW_WHEEL_DIR}") + rapids_find_generate_module( + Arrow NO_CONFIG + VERSION "${PYARROW_VERSION}" + LIBRARY_NAMES "${PYARROW_LIB}" + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports + HEADER_NAMES arrow/python/arrow_to_pandas.h + ) + + find_package(Arrow ${PYARROW_VERSION} MODULE REQUIRED GLOBAL) + add_library(arrow_shared ALIAS Arrow::Arrow) + + # When using the libarrow inside a wheel we must build libcudf with the old ABI because pyarrow's + # `libarrow.so` is compiled for manylinux2014 (centos7 toolchain) which uses the old ABI. Note + # that these flags will often be redundant because we build wheels in manylinux containers that + # actually have the old libstdc++ anyway, but setting them explicitly ensures correct and consistent + # behavior in all other cases such as aarch64 builds on newer manylinux or testing builds in newer + # containers.
Note that tests will not build successfully without also propagating these options + # to builds of GTest. Similarly, benchmarks will not work without updating GBench (and possibly + # NVBench) builds. We are currently ignoring these limitations since we don't anticipate using + # this feature except for building wheels. + target_compile_options( + Arrow::Arrow INTERFACE "$<$:-D_GLIBCXX_USE_CXX11_ABI=0>" + "$<$:-Xcompiler=-D_GLIBCXX_USE_CXX11_ABI=0>" + ) + + rapids_export_package(BUILD Arrow cudf-exports) + rapids_export_package(INSTALL Arrow cudf-exports) + + list(POP_BACK CMAKE_PREFIX_PATH) +endfunction() + # This function finds arrow and sets any additional necessary environment variables. function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENABLE_PYTHON ENABLE_PARQUET ) + if(USE_LIBARROW_FROM_PYARROW) + # Generate a FindArrow.cmake to find pyarrow's libarrow.so + find_libarrow_in_python_wheel(${VERSION}) + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) + set(ARROW_LIBRARIES + arrow_shared + PARENT_SCOPE + ) + return() + endif() + if(BUILD_STATIC) if(TARGET arrow_static) - list(APPEND ARROW_LIBRARIES arrow_static) set(ARROW_FOUND TRUE PARENT_SCOPE ) set(ARROW_LIBRARIES - ${ARROW_LIBRARIES} + arrow_static PARENT_SCOPE ) return() endif() else() if(TARGET arrow_shared) - list(APPEND ARROW_LIBRARIES arrow_shared) set(ARROW_FOUND TRUE PARENT_SCOPE ) set(ARROW_LIBRARIES - ${ARROW_LIBRARIES} + arrow_shared PARENT_SCOPE ) return() endif() endif() - set(ARROW_BUILD_SHARED ON) - set(ARROW_BUILD_STATIC OFF) - set(CPMAddOrFindPackage CPMFindPackage) - if(NOT ARROW_ARMV8_ARCH) set(ARROW_ARMV8_ARCH "armv8-a") endif() @@ -69,8 +124,11 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB set(ARROW_BUILD_STATIC ON) set(ARROW_BUILD_SHARED OFF) # Turn off CPM using `find_package` so we always download and make sure we get proper static - # library - set(CPM_DOWNLOAD_ALL TRUE) + # library. + set(CPM_DOWNLOAD_Arrow TRUE) + else() + set(ARROW_BUILD_SHARED ON) + set(ARROW_BUILD_STATIC OFF) endif() set(ARROW_PYTHON_OPTIONS "") @@ -91,7 +149,8 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_cpm_find( Arrow ${VERSION} - GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared + GLOBAL_TARGETS arrow_shared parquet_shared arrow_dataset_shared arrow_static parquet_static + arrow_dataset_static CPM_ARGS GIT_REPOSITORY https://github.com/apache/arrow.git GIT_TAG apache-arrow-${VERSION} @@ -125,61 +184,65 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB "xsimd_SOURCE AUTO" ) - set(ARROW_FOUND TRUE) - set(ARROW_LIBRARIES "") + set(ARROW_FOUND + TRUE + PARENT_SCOPE + ) - # Arrow_ADDED: set if CPM downloaded Arrow from Github Arrow_DIR: set if CPM found Arrow on the - # system/conda/etc. - if(Arrow_ADDED OR Arrow_DIR) - if(BUILD_STATIC) - list(APPEND ARROW_LIBRARIES arrow_static) - else() - list(APPEND ARROW_LIBRARIES arrow_shared) - endif() + if(BUILD_STATIC) + set(ARROW_LIBRARIES arrow_static) + else() + set(ARROW_LIBRARIES arrow_shared) + endif() - if(Arrow_DIR) - find_package(Arrow REQUIRED QUIET) - if(ENABLE_PARQUET) - if(NOT Parquet_DIR) - # Set this to enable `find_package(Parquet)` - set(Parquet_DIR "${Arrow_DIR}") - endif() - # Set this to enable `find_package(ArrowDataset)` - set(ArrowDataset_DIR "${Arrow_DIR}") - find_package(ArrowDataset REQUIRED QUIET) + # Arrow_DIR: set if CPM found Arrow on the system/conda/etc. 
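The ABI comment above concerns libstdc++'s dual ABI rather than anything Arrow-specific. As a hypothetical C++-side probe (not part of this PR) of what `-D_GLIBCXX_USE_CXX11_ABI=0` toggles:

```cpp
// With _GLIBCXX_USE_CXX11_ABI=0, std::string is the old reference-counted
// type and its symbols live outside the std::__cxx11 namespace, so objects
// built with different settings fail to link (or misbehave) when they pass
// strings across the boundary; hence the flag is forced on every consumer.
#include <cstdio>
#include <string>

int main()
{
#if defined(_GLIBCXX_USE_CXX11_ABI) && _GLIBCXX_USE_CXX11_ABI == 0
  std::puts("old (pre-C++11) libstdc++ ABI");
#else
  std::puts("new (C++11) libstdc++ ABI");
#endif
  return 0;
}
```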
+ if(Arrow_DIR) + # This extra find_package is necessary because rapids_cpm_find does not propagate all the + # variables from find_package that we might need. This is especially problematic when + # rapids_cpm_find builds from source. + find_package(Arrow REQUIRED QUIET) + if(ENABLE_PARQUET) + # Setting Parquet_DIR is conditional because parquet may be installed independently of arrow. + if(NOT Parquet_DIR) + # Set this to enable `find_package(Parquet)` + set(Parquet_DIR "${Arrow_DIR}") endif() - elseif(Arrow_ADDED) - # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to - # target_include_directories. That defeats ccache. - file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" + # Set this to enable `find_package(ArrowDataset)` + set(ArrowDataset_DIR "${Arrow_DIR}") + find_package(ArrowDataset REQUIRED QUIET) + endif() + # Arrow_ADDED: set if CPM downloaded Arrow from Github + elseif(Arrow_ADDED) + # Copy these files so we can avoid adding paths in Arrow_BINARY_DIR to + # target_include_directories. That defeats ccache. + file(INSTALL "${Arrow_BINARY_DIR}/src/arrow/util/config.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/arrow/util" + ) + if(ENABLE_PARQUET) + file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" + DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" ) - if(ENABLE_PARQUET) - file(INSTALL "${Arrow_BINARY_DIR}/src/parquet/parquet_version.h" - DESTINATION "${Arrow_SOURCE_DIR}/cpp/src/parquet" - ) - endif() - # - # This shouldn't be necessary! - # - # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` - # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. - # - # This only works because we know exactly which components we're using. Don't forget to update - # this list if we add more! - # - foreach(ARROW_LIBRARY ${ARROW_LIBRARIES}) - target_include_directories( - ${ARROW_LIBRARY} - INTERFACE "$" - "$" - "$" - "$" - ) - endforeach() endif() + # Arrow populates INTERFACE_INCLUDE_DIRECTORIES for the `arrow_static` and `arrow_shared` + # targets in FindArrow, so for static source-builds, we have to do it after-the-fact. + # + # This only works because we know exactly which components we're using. Don't forget to update + # this list if we add more! + # + foreach(ARROW_LIBRARY ${ARROW_LIBRARIES}) + target_include_directories( + ${ARROW_LIBRARY} + INTERFACE "$" + "$" + "$" + "$" + ) + endforeach() else() - set(ARROW_FOUND FALSE) + set(ARROW_FOUND + FALSE + PARENT_SCOPE + ) message(FATAL_ERROR "CUDF: Arrow library not found or downloaded.") endif() @@ -294,15 +357,10 @@ function(find_and_configure_arrow VERSION BUILD_STATIC ENABLE_S3 ENABLE_ORC ENAB rapids_export_find_package_root(BUILD ArrowDataset [=[${CMAKE_CURRENT_LIST_DIR}]=] cudf-exports) endif() - set(ARROW_FOUND - "${ARROW_FOUND}" - PARENT_SCOPE - ) set(ARROW_LIBRARIES "${ARROW_LIBRARIES}" PARENT_SCOPE ) - endfunction() if(NOT DEFINED CUDF_VERSION_Arrow) diff --git a/cpp/cmake/thirdparty/get_dlpack.cmake b/cpp/cmake/thirdparty/get_dlpack.cmake index 252d50c7af8..65b5f4ff2eb 100644 --- a/cpp/cmake/thirdparty/get_dlpack.cmake +++ b/cpp/cmake/thirdparty/get_dlpack.cmake @@ -1,5 +1,5 @@ # ============================================================================= -# Copyright (c) 2020-2021, NVIDIA CORPORATION. +# Copyright (c) 2020-2022, NVIDIA CORPORATION. 
# # Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except # in compliance with the License. You may obtain a copy of the License at @@ -15,6 +15,7 @@ # This function finds dlpack and sets any additional necessary environment variables. function(find_and_configure_dlpack VERSION) + include(${rapids-cmake-dir}/find/generate_module.cmake) rapids_find_generate_module(DLPACK HEADER_NAMES dlpack.h) rapids_cpm_find( diff --git a/cpp/cmake/thirdparty/get_jitify.cmake b/cpp/cmake/thirdparty/get_jitify.cmake index b7c90952c95..d98abdf8824 100644 --- a/cpp/cmake/thirdparty/get_jitify.cmake +++ b/cpp/cmake/thirdparty/get_jitify.cmake @@ -18,7 +18,7 @@ function(find_and_configure_jitify) rapids_cpm_find( jitify 2.0.0 - GIT_REPOSITORY https://github.com/NVIDIA/jitify.git + GIT_REPOSITORY https://github.com/rapidsai/jitify.git GIT_TAG jitify2 GIT_SHALLOW TRUE DOWNLOAD_ONLY TRUE diff --git a/cpp/cmake/thirdparty/get_thrust.cmake b/cpp/cmake/thirdparty/get_thrust.cmake index 379b1521bf0..25a4c9dd3ba 100644 --- a/cpp/cmake/thirdparty/get_thrust.cmake +++ b/cpp/cmake/thirdparty/get_thrust.cmake @@ -13,73 +13,34 @@ # ============================================================================= # This function finds thrust and sets any additional necessary environment variables. -function(find_and_configure_thrust VERSION) - # We only want to set `UPDATE_DISCONNECTED` while the GIT tag hasn't moved from the last time we - # cloned - set(cpm_thrust_disconnect_update "UPDATE_DISCONNECTED TRUE") - set(CPM_THRUST_CURRENT_VERSION - ${VERSION} - CACHE STRING "version of thrust we checked out" - ) - if(NOT VERSION VERSION_EQUAL CPM_THRUST_CURRENT_VERSION) - set(CPM_THRUST_CURRENT_VERSION - ${VERSION} - CACHE STRING "version of thrust we checked out" FORCE - ) - set(cpm_thrust_disconnect_update "") - endif() +function(find_and_configure_thrust) - # We currently require cuDF to always build with a custom version of thrust. 
This is needed so - # that build times of of cudf are kept reasonable, without this CI builds of cudf will be killed - # as some source file can take over 45 minutes to build - # - set(CPM_DOWNLOAD_ALL TRUE) - rapids_cpm_find( - Thrust ${VERSION} - BUILD_EXPORT_SET cudf-exports - INSTALL_EXPORT_SET cudf-exports - CPM_ARGS - GIT_REPOSITORY https://github.com/NVIDIA/thrust.git - GIT_TAG ${VERSION} - GIT_SHALLOW TRUE ${cpm_thrust_disconnect_update} - PATCH_COMMAND patch --reject-file=- -p1 -N < ${CUDF_SOURCE_DIR}/cmake/thrust.patch || true - OPTIONS "THRUST_INSTALL TRUE" - ) + include(${rapids-cmake-dir}/cpm/thrust.cmake) + include(${rapids-cmake-dir}/cpm/package_override.cmake) - if(NOT TARGET cudf::Thrust) - thrust_create_target(cudf::Thrust FROM_OPTIONS) - endif() + set(cudf_patch_dir "${CMAKE_CURRENT_FUNCTION_LIST_DIR}/patches") + rapids_cpm_package_override("${cudf_patch_dir}/thrust_override.json") - if(Thrust_SOURCE_DIR) # only install thrust when we have an in-source version - include(GNUInstallDirs) - install( - DIRECTORY "${Thrust_SOURCE_DIR}/thrust" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/" - FILES_MATCHING - REGEX "\\.(h|inl)$" - ) - install( - DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/dependencies/" - FILES_MATCHING - PATTERN "*.cuh" - ) + # Make sure we install thrust into the `include/libcudf` subdirectory instead of the default + include(GNUInstallDirs) + set(CMAKE_INSTALL_INCLUDEDIR "${CMAKE_INSTALL_INCLUDEDIR}/libcudf") + set(CMAKE_INSTALL_LIBDIR "${CMAKE_INSTALL_INCLUDEDIR}/lib") - install(DIRECTORY "${Thrust_SOURCE_DIR}/thrust/cmake" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/thrust/" - ) - install(DIRECTORY "${Thrust_SOURCE_DIR}/dependencies/cub/cub/cmake" - DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}/libcudf/Thrust/dependencies/cub/" - ) + # Find or install Thrust with our custom set of patches + rapids_cpm_thrust( + NAMESPACE cudf + BUILD_EXPORT_SET cudf-exports + INSTALL_EXPORT_SET cudf-exports + ) + if(Thrust_SOURCE_DIR) # Store where CMake can find our custom Thrust install include("${rapids-cmake-dir}/export/find_package_root.cmake") rapids_export_find_package_root( - INSTALL Thrust [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/Thrust/]=] cudf-exports + INSTALL Thrust [=[${CMAKE_CURRENT_LIST_DIR}/../../../include/libcudf/lib/cmake/thrust]=] + cudf-exports ) endif() endfunction() -set(CUDF_MIN_VERSION_Thrust 1.17.2) - -find_and_configure_thrust(${CUDF_MIN_VERSION_Thrust}) +find_and_configure_thrust() diff --git a/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff new file mode 100644 index 00000000000..382f7dca1b0 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_disable_64bit_dispatching.diff @@ -0,0 +1,29 @@ +diff --git a/thrust/system/cuda/detail/dispatch.h b/thrust/system/cuda/detail/dispatch.h +index d0e3f94..76774b0 100644 +--- a/thrust/system/cuda/detail/dispatch.h ++++ b/thrust/system/cuda/detail/dispatch.h +@@ -32,9 +32,8 @@ + status = call arguments; \ + } \ + else { \ +- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ +- status = call arguments; \ +- } ++ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ ++ } + + /** + * Dispatch between 32-bit and 64-bit index based versions of the same algorithm +@@ -52,10 +51,8 @@ + status = call arguments; \ + } \ + else { \ +- auto THRUST_PP_CAT2(count1, _fixed) = 
static_cast<thrust::detail::int64_t>(count1); \ +- auto THRUST_PP_CAT2(count2, _fixed) = static_cast<thrust::detail::int64_t>(count2); \ +- status = call arguments; \ +- } ++ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ ++ } + /** + * Dispatch between 32-bit and 64-bit index based versions of the same algorithm + * implementation. This version allows using different token sequences for callables diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff new file mode 100644 index 00000000000..6bf165805cc --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_faster_scan_compile_times.diff @@ -0,0 +1,39 @@ +diff --git a/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh b/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh +index b188c75f..3f36656f 100644 +--- a/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh ++++ b/dependencies/cub/cub/device/dispatch/dispatch_radix_sort.cuh +@@ -736,7 +736,7 @@ struct DeviceRadixSortPolicy + + + /// SM60 (GP100) +- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + enum { + PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 7 : 5, // 6.9B 32b keys/s (Quadro P100) +diff --git a/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh b/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh +index e0470ccb..6a0c2ed6 100644 +--- a/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh ++++ b/dependencies/cub/cub/device/dispatch/dispatch_reduce.cuh +@@ -280,7 +280,7 @@ struct DeviceReducePolicy + }; + + /// SM60 +- struct Policy600 : ChainedPolicy<600, Policy600, Policy350> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) + typedef AgentReducePolicy< +diff --git a/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh b/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh +index c2d04588..ac2d10e0 100644 +--- a/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh ++++ b/dependencies/cub/cub/device/dispatch/dispatch_scan.cuh +@@ -177,7 +177,7 @@ struct DeviceScanPolicy + }; + + /// SM600 +- struct Policy600 : ChainedPolicy<600, Policy600, Policy520> ++ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> + { + typedef AgentScanPolicy< + 128, 15, ///< Threads per block, items per thread diff --git a/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff new file mode 100644 index 00000000000..864c89d4504 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_faster_sort_compile_times.diff @@ -0,0 +1,48 @@ +diff --git a/dependencies/cub/cub/block/block_merge_sort.cuh b/dependencies/cub/cub/block/block_merge_sort.cuh +index 4769df36..d86d6342 100644 +--- a/dependencies/cub/cub/block/block_merge_sort.cuh ++++ b/dependencies/cub/cub/block/block_merge_sort.cuh +@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared, + KeyT key1 = keys_shared[keys1_beg]; + KeyT key2 = keys_shared[keys2_beg]; + +-#pragma unroll ++#pragma unroll 1 + for (int item = 0; item < ITEMS_PER_THREAD; ++item) + { + bool p = (keys2_beg < keys2_end) && +@@ -383,7 +383,7 @@ public: + // + KeyT max_key = oob_default; + +- #pragma unroll ++ #pragma unroll 1 + for (int item = 1; item < ITEMS_PER_THREAD; ++item) + { + if (ITEMS_PER_THREAD * linear_tid + item < valid_items) +@@ -407,7 +407,7
@@ public: + // each thread has sorted keys + // merge sort keys in shared memory + // +- #pragma unroll ++ #pragma unroll 1 + for (int target_merged_threads_number = 2; + target_merged_threads_number <= NUM_THREADS; + target_merged_threads_number *= 2) +diff --git a/dependencies/cub/cub/thread/thread_sort.cuh b/dependencies/cub/cub/thread/thread_sort.cuh +index 5d486789..b42fb5f0 100644 +--- a/dependencies/cub/cub/thread/thread_sort.cuh ++++ b/dependencies/cub/cub/thread/thread_sort.cuh +@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], + { + constexpr bool KEYS_ONLY = std::is_same<ValueT, NullType>::value; + +- #pragma unroll ++ #pragma unroll 1 + for (int i = 0; i < ITEMS_PER_THREAD; ++i) + { +- #pragma unroll ++ #pragma unroll 1 + for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) + { + if (compare_op(keys[j + 1], keys[j])) diff --git a/cpp/cmake/thirdparty/patches/thrust_override.json b/cpp/cmake/thirdparty/patches/thrust_override.json new file mode 100644 index 00000000000..f1908a64719 --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_override.json @@ -0,0 +1,34 @@ + +{ + "packages" : { + "Thrust" : { + "patches" : [ + { + "file" : "Thrust/install_rules.diff", + "issue" : "Thrust 1.X installs incorrect files [https://github.com/NVIDIA/thrust/issues/1790]", + "fixed_in" : "2.0.0" + }, + { + "file" : "${current_json_dir}/thrust_transform_iter_with_reduce_by_key.diff", + "issue" : "Support transform_output_iterator as output of reduce by key [https://github.com/NVIDIA/thrust/pull/1805]", + "fixed_in" : "2.1" + }, + { + "file" : "${current_json_dir}/thrust_disable_64bit_dispatching.diff", + "issue" : "Remove 64bit dispatching as not needed by libcudf and results in compiling twice as many kernels [https://github.com/rapidsai/cudf/pull/11437]", + "fixed_in" : "" + }, + { + "file" : "${current_json_dir}/thrust_faster_sort_compile_times.diff", + "issue" : "Improve Thrust sort compile times by not unrolling loops for inlined comparators [https://github.com/rapidsai/cudf/pull/10577]", + "fixed_in" : "" + }, + { + "file" : "${current_json_dir}/thrust_faster_scan_compile_times.diff", + "issue" : "Improve Thrust scan compile times by reducing the number of kernels generated [https://github.com/rapidsai/cudf/pull/8183]", + "fixed_in" : "" + } + ] + } + } +} diff --git a/cpp/cmake/thirdparty/patches/thrust_transform_iter_with_reduce_by_key.diff b/cpp/cmake/thirdparty/patches/thrust_transform_iter_with_reduce_by_key.diff new file mode 100644 index 00000000000..6a56af90d0d --- /dev/null +++ b/cpp/cmake/thirdparty/patches/thrust_transform_iter_with_reduce_by_key.diff @@ -0,0 +1,26 @@ +diff --git a/thrust/iterator/transform_input_output_iterator.h b/thrust/iterator/transform_input_output_iterator.h +index f512a36..a5f725d 100644 +--- a/thrust/iterator/transform_input_output_iterator.h ++++ b/thrust/iterator/transform_input_output_iterator.h +@@ -102,6 +102,8 @@ template + /*! \endcond + */ + ++ transform_input_output_iterator() = default; ++ + /*! This constructor takes as argument a \c Iterator an \c InputFunction and an + * \c OutputFunction and copies them to a new \p transform_input_output_iterator + * +diff --git a/thrust/iterator/transform_output_iterator.h b/thrust/iterator/transform_output_iterator.h +index 66fb46a..4a68cb5 100644 +--- a/thrust/iterator/transform_output_iterator.h ++++ b/thrust/iterator/transform_output_iterator.h +@@ -104,6 +104,8 @@ template + /*! \endcond + */ + ++ transform_output_iterator() = default; ++ + /*!
This constructor takes as argument an \c OutputIterator and an \c + * UnaryFunction and copies them to a new \p transform_output_iterator + * diff --git a/cpp/cmake/thrust.patch b/cpp/cmake/thrust.patch deleted file mode 100644 index ae1962e4738..00000000000 --- a/cpp/cmake/thrust.patch +++ /dev/null @@ -1,116 +0,0 @@ -diff --git a/cub/block/block_merge_sort.cuh b/cub/block/block_merge_sort.cuh -index 4769df36..d86d6342 100644 ---- a/cub/block/block_merge_sort.cuh -+++ b/cub/block/block_merge_sort.cuh -@@ -91,7 +91,7 @@ __device__ __forceinline__ void SerialMerge(KeyT *keys_shared, - KeyT key1 = keys_shared[keys1_beg]; - KeyT key2 = keys_shared[keys2_beg]; - --#pragma unroll -+#pragma unroll 1 - for (int item = 0; item < ITEMS_PER_THREAD; ++item) - { - bool p = (keys2_beg < keys2_end) && -@@ -383,7 +383,7 @@ public: - // - KeyT max_key = oob_default; - -- #pragma unroll -+ #pragma unroll 1 - for (int item = 1; item < ITEMS_PER_THREAD; ++item) - { - if (ITEMS_PER_THREAD * linear_tid + item < valid_items) -@@ -407,7 +407,7 @@ public: - // each thread has sorted keys - // merge sort keys in shared memory - // -- #pragma unroll -+ #pragma unroll 1 - for (int target_merged_threads_number = 2; - target_merged_threads_number <= NUM_THREADS; - target_merged_threads_number *= 2) -diff --git a/cub/device/dispatch/dispatch_radix_sort.cuh b/cub/device/dispatch/dispatch_radix_sort.cuh -index b188c75f..3f36656f 100644 ---- a/cub/device/dispatch/dispatch_radix_sort.cuh -+++ b/cub/device/dispatch/dispatch_radix_sort.cuh -@@ -736,7 +736,7 @@ struct DeviceRadixSortPolicy - - - /// SM60 (GP100) -- struct Policy600 : ChainedPolicy<600, Policy600, Policy500> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - enum { - PRIMARY_RADIX_BITS = (sizeof(KeyT) > 1) ? 
7 : 5, // 6.9B 32b keys/s (Quadro P100) -diff --git a/cub/device/dispatch/dispatch_reduce.cuh b/cub/device/dispatch/dispatch_reduce.cuh -index e0470ccb..6a0c2ed6 100644 ---- a/cub/device/dispatch/dispatch_reduce.cuh -+++ b/cub/device/dispatch/dispatch_reduce.cuh -@@ -280,7 +280,7 @@ struct DeviceReducePolicy - }; - - /// SM60 -- struct Policy600 : ChainedPolicy<600, Policy600, Policy350> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - // ReducePolicy (P100: 591 GB/s @ 64M 4B items; 583 GB/s @ 256M 1B items) - typedef AgentReducePolicy< -diff --git a/cub/device/dispatch/dispatch_scan.cuh b/cub/device/dispatch/dispatch_scan.cuh -index c2d04588..ac2d10e0 100644 ---- a/cub/device/dispatch/dispatch_scan.cuh -+++ b/cub/device/dispatch/dispatch_scan.cuh -@@ -177,7 +177,7 @@ struct DeviceScanPolicy - }; - - /// SM600 -- struct Policy600 : ChainedPolicy<600, Policy600, Policy520> -+ struct Policy600 : ChainedPolicy<600, Policy600, Policy600> - { - typedef AgentScanPolicy< - 128, 15, ///< Threads per block, items per thread -diff --git a/cub/thread/thread_sort.cuh b/cub/thread/thread_sort.cuh -index 5d486789..b42fb5f0 100644 ---- a/cub/thread/thread_sort.cuh -+++ b/cub/thread/thread_sort.cuh -@@ -83,10 +83,10 @@ StableOddEvenSort(KeyT (&keys)[ITEMS_PER_THREAD], - { - constexpr bool KEYS_ONLY = std::is_same::value; - -- #pragma unroll -+ #pragma unroll 1 - for (int i = 0; i < ITEMS_PER_THREAD; ++i) - { -- #pragma unroll -+ #pragma unroll 1 - for (int j = 1 & i; j < ITEMS_PER_THREAD - 1; j += 2) - { - if (compare_op(keys[j + 1], keys[j])) -diff --git a/thrust/system/cuda/detail/dispatch.h b/thrust/system/cuda/detail/dispatch.h -index d0e3f94..76774b0 100644 ---- a/thrust/system/cuda/detail/dispatch.h -+++ b/thrust/system/cuda/detail/dispatch.h -@@ -32,9 +32,8 @@ - status = call arguments; \ - } \ - else { \ -- auto THRUST_PP_CAT2(count, _fixed) = static_cast(count); \ -- status = call arguments; \ -- } -+ throw std::runtime_error("THRUST_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ -+ } - - /** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm -@@ -52,10 +51,8 @@ - status = call arguments; \ - } \ - else { \ -- auto THRUST_PP_CAT2(count1, _fixed) = static_cast(count1); \ -- auto THRUST_PP_CAT2(count2, _fixed) = static_cast(count2); \ -- status = call arguments; \ -- } -+ throw std::runtime_error("THRUST_DOUBLE_INDEX_TYPE_DISPATCH 64-bit count is unsupported in libcudf"); \ -+ } - /** - * Dispatch between 32-bit and 64-bit index based versions of the same algorithm - * implementation. This version allows using different token sequences for callables diff --git a/cpp/doxygen/Doxyfile b/cpp/doxygen/Doxyfile index e7be02ab82b..4684e180f00 100644 --- a/cpp/doxygen/Doxyfile +++ b/cpp/doxygen/Doxyfile @@ -38,7 +38,7 @@ PROJECT_NAME = "libcudf" # could be handy for archiving the generated documentation or if some version # control system is used. -PROJECT_NUMBER = 22.10.01 +PROJECT_NUMBER = 22.12.00 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give viewer a @@ -2162,7 +2162,7 @@ SKIP_FUNCTION_MACROS = YES # the path). If a tag file is not located in the directory in which doxygen is # run, you must also specify the path to the tagfile here. 
-TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/22.10 +TAGFILES = rmm.tag=https://docs.rapids.ai/api/librmm/22.12 # When a file name is specified after GENERATE_TAGFILE, doxygen will create a # tag file that is based on the input files it reads. See section "Linking to diff --git a/cpp/doxygen/DoxygenLayout.xml b/cpp/doxygen/DoxygenLayout.xml index a78a1cb701f..ded88dfe531 100644 --- a/cpp/doxygen/DoxygenLayout.xml +++ b/cpp/doxygen/DoxygenLayout.xml @@ -12,29 +12,29 @@ diff --git a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md index b3774aeda38..3c085984a0e 100644 --- a/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md +++ b/cpp/doxygen/developer_guide/DEVELOPER_GUIDE.md @@ -176,7 +176,7 @@ Resource ownership is an essential concept in libcudf. In short, an "owning" obj resource (such as device memory). It acquires that resource during construction and releases the resource in destruction ([RAII](https://en.cppreference.com/w/cpp/language/raii)). A "non-owning" object does not own resources. Any class in libcudf with the `*_view` suffix is non-owning. For more -detail see the [`libcudf++` presentation.](https://docs.google.com/presentation/d/1zKzAtc1AWFKfMhiUlV5yRZxSiPLwsObxMlWRWz_f5hA/edit?usp=sharing) +detail see the [`libcudf` presentation.](https://docs.google.com/presentation/d/1zKzAtc1AWFKfMhiUlV5yRZxSiPLwsObxMlWRWz_f5hA/edit?usp=sharing) libcudf functions typically take views as input (`column_view` or `table_view`) and produce `unique_ptr`s to owning objects as output. For example, @@ -346,7 +346,72 @@ the device view can be obtained via function `column_device_view::create(column_ data, a specialized device view for list columns can be constructed via `lists_column_device_view(column_device_view)`. -# libcudf++ API and Implementation +# libcudf Policies and Design Principles + +`libcudf` is designed to provide thread-safe, single-GPU accelerated algorithm primitives for solving a wide variety of problems that arise in data science. +APIs are written to execute on the default GPU, which can be controlled by the caller through standard CUDA device APIs or environment variables like `CUDA_VISIBLE_DEVICES`. +Our goal is to enable diverse use cases like Spark or Pandas to benefit from the performance of GPUs, and libcudf relies on these higher-level layers like Spark or Dask to orchestrate multi-GPU tasks. + +To best satisfy these use-cases, libcudf prioritizes performance and flexibility, which sometimes may come at the cost of convenience. +While we welcome users to use libcudf directly, we design with the expectation that most users will be consuming libcudf through higher-level layers like Spark or cuDF Python that handle some of the details that direct users of libcudf must handle on their own. +We document these policies and the reasons behind them here. + +## libcudf does not introspect data + +libcudf APIs generally do not perform deep introspection and validation of input data. +There are numerous reasons for this: +1. It violates the single responsibility principle: validation is separate from execution. +2. Since libcudf data structures store data on the GPU, any validation incurs _at minimum_ the overhead of a kernel launch, and may in general be prohibitively expensive. +3. API promises around data introspection often significantly complicate implementation. + +Users are therefore responsible for passing valid data into such APIs.
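+For illustration, a sketch of where that line falls (not from the guide itself; `lhs` and `rhs` are hypothetical `cudf::column_view`s with mismatched row counts):
+
+```c++
+#include <cudf/binaryop.hpp>
+
+// A size mismatch is visible from host-side metadata, so libcudf rejects it
+// (typically by throwing cudf::logic_error) without launching a kernel.
+auto out = cudf::binary_operation(lhs, rhs, cudf::binary_operator::ADD,
+                                  cudf::data_type{cudf::type_id::INT32});
+
+// By contrast, element values are never inspected: integer overflow in the
+// ADD above is the caller's responsibility.
+```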
+_Note that this policy does not mean that libcudf performs no validation whatsoever_. +libcudf APIs should still perform any validation that does not require introspection. +To give some idea of what should or should not be validated, here are (non-exhaustive) lists of examples. + +**Things that libcudf should validate**: +- Input column/table sizes or dtypes + +**Things that libcudf should not validate**: +- Integer overflow +- Ensuring that outputs will not exceed the 2GB size limit for a given set of inputs + + +## libcudf expects nested types to have sanitized null masks + +Various libcudf APIs accepting columns of nested dtypes (such as `LIST` or `STRUCT`) may assume that these columns have been sanitized. +In this context, sanitization refers to ensuring that the null elements in a column with a nested dtype are compatible with the elements of nested columns. +Specifically: +- Null elements of list columns should also be empty. The starting offset of a null element should be equal to the ending offset. +- Null elements of struct columns should also be null elements in the underlying structs. +- For compound columns, nulls should only be present at the level of the parent column. Child columns should not contain nulls. +- Slice operations on nested columns do not propagate offsets to child columns. + +libcudf APIs _should_ promise to never return "dirty" columns, i.e. columns containing unsanitized data. +Therefore, the only problem is if users construct input columns that are not correctly sanitized and then pass those into libcudf APIs. + +## Treat libcudf APIs as if they were asynchronous + +libcudf APIs called on the host do not guarantee that the stream is synchronized before returning. +Work in libcudf occurs on `cudf::get_default_stream().value`, which defaults to the CUDA default stream (stream 0). +Note that the stream 0 behavior differs if [per-thread default stream is enabled](https://docs.nvidia.com/cuda/cuda-runtime-api/stream-sync-behavior.html) via `CUDF_USE_PER_THREAD_DEFAULT_STREAM`. +Any data provided to or returned by libcudf that uses a separate non-blocking stream requires synchronization with the default libcudf stream to ensure stream safety. + +## libcudf generally does not make ordering guarantees + +Functions like merge or groupby in libcudf make no guarantees about the order of entries in the output. +Promising deterministic ordering is not, in general, conducive to fast parallel algorithms. +Calling code is responsible for performing sorts after the fact if sorted outputs are needed. + +## libcudf does not promise specific exception messages + +libcudf documents the exceptions that will be thrown by an API for different kinds of invalid inputs. +The types of those exceptions (e.g. `cudf::logic_error`) are part of the public API. +However, the explanatory string returned by the `what` method of those exceptions is not part of the API and is subject to change. +Calling code should not rely on the contents of libcudf error messages to determine the nature of the error. +For information on the types of exceptions that libcudf throws under different circumstances, see the [section on error handling](#errors). + +# libcudf API and Implementation ## Streams @@ -359,7 +424,7 @@ internal API in the `detail` namespace. The internal `detail` API has the same p public API, plus a `rmm::cuda_stream_view` parameter at the end with no default value. 
If the detail API also accepts a memory resource parameter, the stream parameter should be ideally placed just *before* the memory resource. The public API will call the detail API and provide -`cudf::default_stream_value`. The implementation should be wholly contained in the `detail` API +`cudf::get_default_stream()`. The implementation should be wholly contained in the `detail` API definition and use only asynchronous versions of CUDA APIs with the stream parameter. In order to make the `detail` API callable from other libcudf functions, it should be exposed in a @@ -390,7 +455,7 @@ namespace detail{ void external_function(...){ CUDF_FUNC_RANGE(); // Generates an NVTX range for the lifetime of this function. - detail::external_function(..., cudf::default_stream_value); + detail::external_function(..., cudf::get_default_stream()); } ``` @@ -780,7 +845,7 @@ description of what has broken from the past release. Label pull requests that c with the "non-breaking" tag. -# Error Handling +# Error Handling {#errors} libcudf follows conventions (and provides utilities) enforcing compile-time and run-time conditions and detecting and handling CUDA errors. Communication of errors is always via C++ diff --git a/cpp/doxygen/developer_guide/DOCUMENTATION.md b/cpp/doxygen/developer_guide/DOCUMENTATION.md index 8a7d89c8dbd..b86f7db82b0 100644 --- a/cpp/doxygen/developer_guide/DOCUMENTATION.md +++ b/cpp/doxygen/developer_guide/DOCUMENTATION.md @@ -1,4 +1,4 @@ -# libcudf++ C++ Documentation Guide +# libcudf C++ Documentation Guide These guidelines apply to documenting all libcudf C++ source files using doxygen style formatting although only public APIs and classes are actually [published](https://docs.rapids.ai/api/libcudf/stable/index.html). @@ -224,7 +224,7 @@ Also, \@copydoc is useful when documenting a `detail` function that differs only */ std::vector segmented_count_set_bits(bitmask_type const* bitmask, std::vector const& indices, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream = cudf::get_default_stream()); Note, you must specify the whole signature of the function, including optional parameters, so that doxygen will be able to locate it. diff --git a/cpp/doxygen/developer_guide/TESTING.md b/cpp/doxygen/developer_guide/TESTING.md index 31747e31ccb..198590bb35c 100644 --- a/cpp/doxygen/developer_guide/TESTING.md +++ b/cpp/doxygen/developer_guide/TESTING.md @@ -6,6 +6,13 @@ Unit tests in libcudf are written using **Important:** Instead of including `gtest/gtest.h` directly, use `#include `. +Also, write test code in the global namespace. That is, +do not write test code in the `cudf` or the `cudf::test` namespace or their +sub-namespaces. +Likewise, do not use `using namespace cudf;` or `using namespace cudf::test;` +in the global namespace. + + ## Best Practices: What Should We Test? In general we should test to make sure all code paths are covered. This is not always easy or @@ -38,8 +45,8 @@ groupby). Here are some other guidelines. does happen); columns with zero size but that somehow have non-null data pointers; and struct columns with no children. - * Decimal types are not included in the `NumericTypes` type list, but are included in - `FixedWidthTypes`, so be careful that tests either include or exclude decimal types as + * Decimal types are not included in the `cudf::test::NumericTypes` type list, but are included in + `cudf::test::FixedWidthTypes`, so be careful that tests either include or exclude decimal types as appropriate. 
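+For example, a typed test suite that must exclude decimals can build on `cudf::test::FixedWidthTypesWithoutFixedPoint` (a sketch; the fixture name is hypothetical):
+
+```c++
+#include <cudf_test/base_fixture.hpp>
+#include <cudf_test/type_lists.hpp>
+
+template <typename T>
+struct RoundTripTest : public cudf::test::BaseFixture {};
+
+// Same as cudf::test::FixedWidthTypes but without DECIMAL32/DECIMAL64.
+TYPED_TEST_SUITE(RoundTripTest, cudf::test::FixedWidthTypesWithoutFixedPoint);
+```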
@@ -99,8 +106,8 @@ list defined in `TestTypes` (`int, float, double`). The list of types that are used in tests should be consistent across all tests. To ensure consistency, several sets of common type lists are provided in -`include/cudf_test/type_lists.hpp`. For example, `NumericTypes` is a type list of all numeric types, -`FixedWidthTypes` is a list of all fixed-width element types, and `AllTypes` is a list of every +`include/cudf_test/type_lists.hpp`. For example, `cudf::test::NumericTypes` is a type list of all numeric types, +`FixedWidthTypes` is a list of all fixed-width element types, and `cudf::test::AllTypes` is a list of every element type that libcudf supports. ```c++ @@ -126,9 +133,8 @@ the `N`th type within the nested list, use `GetType`. Imagine testing all possible two-type combinations of ``. This could be done manually: ```c++ -using namespace cudf::test; template -TwoTypesFixture : BaseFixture{...}; +TwoTypesFixture : cudf::test::BaseFixture{...}; using TwoTypesList = Types< Types, Types, Types, Types >; TYPED_TEST_SUITE(TwoTypesFixture, TwoTypesList); @@ -178,9 +184,9 @@ transparently passed to any API expecting a `column_view` or `mutable_column_vie #### fixed_width_column_wrapper -The `fixed_width_column_wrapper` class should be used for constructing and initializing columns of +The `cudf::test::fixed_width_column_wrapper` class should be used for constructing and initializing columns of any fixed-width element type, e.g., numeric types, timestamp types, Boolean, etc. -`fixed_width_column_wrapper` provides constructors that accept an iterator range to generate each +`cudf::test::fixed_width_column_wrapper` provides constructors that accept an iterator range to generate each element in the column. For nullable columns, an additional iterator can be provided to indicate the validity of each element. There are also constructors that accept a `std::initializer_list` for the column elements and optionally for the validity of each element. @@ -189,25 +195,25 @@ Example: ```c++ // Creates a non-nullable column of INT32 elements with 5 elements: {0, 1, 2, 3, 4} -auto elements = make_counting_transform_iterator(0, [](auto i){return i;}); -fixed_width_column_wrapper w(elements, elements + 5); +auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i;}); +cudf::test::fixed_width_column_wrapper w(elements, elements + 5); // Creates a nullable column of INT32 elements with 5 elements: {null, 1, null, 3, null} -auto elements = make_counting_transform_iterator(0, [](auto i){return i;}); -auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}) -fixed_width_column_wrapper w(elements, elements + 5, validity); +auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i;}); +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i % 2;}) +cudf::test::fixed_width_column_wrapper w(elements, elements + 5, validity); // Creates a non-nullable INT32 column with 4 elements: {1, 2, 3, 4} -fixed_width_column_wrapper w{{1, 2, 3, 4}}; +cudf::test::fixed_width_column_wrapper w{{1, 2, 3, 4}}; // Creates a nullable INT32 column with 4 elements: {1, NULL, 3, NULL} -fixed_width_column_wrapper w{ {1,2,3,4}, {1, 0, 1, 0}}; +cudf::test::fixed_width_column_wrapper w{ {1,2,3,4}, {1, 0, 1, 0}}; ``` #### fixed_point_column_wrapper -The `fixed_point_column_wrapper` class should be used for constructing and initializing columns of -any fixed-point element type (DECIMAL32 or DECIMAL64). 
`fixed_point_column_wrapper` provides +The `cudf::test::fixed_point_column_wrapper` class should be used for constructing and initializing columns of +any fixed-point element type (DECIMAL32 or DECIMAL64). `cudf::test::fixed_point_column_wrapper` provides constructors that accept an iterator range to generate each element in the column. For nullable columns, an additional iterator can be provided to indicate the validity of each element. Constructors also take the scale of the fixed-point values to create. @@ -215,20 +221,20 @@ Constructors also take the scale of the fixed-point values to create. Example: ```c++ - // Creates a non-nullable column of 4 DECIMAL32 elements of scale 3: {1000, 2000, 3000, 4000} - auto elements = make_counting_transform_iterator(0, [](auto i){ return i; }); - fixed_point_column_wrapper<int32_t> w(elements, elements + 4, 3); - - // Creates a nullable column of 5 DECIMAL32 elements of scale 2: {null, 100, null, 300, null} - auto elements = make_counting_transform_iterator(0, [](auto i){ return i; }); - auto validity = make_counting_transform_iterator(0, [](auto i){ return i % 2; }); - fixed_point_column_wrapper<int32_t> w(elements, elements + 5, validity, 2); +// Creates a non-nullable column of 4 DECIMAL32 elements of scale 3: {1000, 2000, 3000, 4000} +auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i){ return i; }); +cudf::test::fixed_point_column_wrapper<int32_t> w(elements, elements + 4, 3); + +// Creates a nullable column of 5 DECIMAL32 elements of scale 2: {null, 100, null, 300, null} +auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i){ return i; }); +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){ return i % 2; }); +cudf::test::fixed_point_column_wrapper<int32_t> w(elements, elements + 5, validity, 2); ``` #### dictionary_column_wrapper -The `dictionary_column_wrapper` class should be used to create dictionary columns. -`dictionary_column_wrapper` provides constructors that accept an iterator range to generate each +The `cudf::test::dictionary_column_wrapper` class should be used to create dictionary columns. +`cudf::test::dictionary_column_wrapper` provides constructors that accept an iterator range to generate each element in the column. For nullable columns, an additional iterator can be provided to indicate the validity of each element. There are also constructors that accept a `std::initializer_list` for the column elements and optionally for the validity of each element. @@ -239,43 +245,43 @@ Example: ```c++ // Creates a non-nullable dictionary column of INT32 elements with 5 elements // keys = {0, 2, 6}, indices = {0, 1, 1, 2, 2} std::vector<int32_t> elements{0, 2, 2, 6, 6}; -dictionary_column_wrapper<int32_t> w(elements.begin(), elements.end()); +cudf::test::dictionary_column_wrapper<int32_t> w(elements.begin(), elements.end()); // Creates a nullable dictionary column with 5 elements and a validity iterator. std::vector<int32_t> elements{0, 2, 0, 6, 0}; // Validity iterator here sets even rows to null. -auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}) +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i % 2;}) // keys = {2, 6}, indices = {NULL, 0, NULL, 1, NULL} -dictionary_column_wrapper<int32_t> w(elements, elements + 5, validity); +cudf::test::dictionary_column_wrapper<int32_t> w(elements, elements + 5, validity); // Creates a non-nullable dictionary column with 4 elements.
// keys = {1, 2, 3}, indices = {0, 1, 2, 0} -dictionary_column_wrapper<int32_t> w{{1, 2, 3, 1}}; +cudf::test::dictionary_column_wrapper<int32_t> w{{1, 2, 3, 1}}; // Creates a nullable dictionary column with 4 elements and validity initializer. // keys = {1, 3}, indices = {0, NULL, 1, NULL} -dictionary_column_wrapper<int32_t> w{ {1, 0, 3, 0}, {1, 0, 1, 0}}; +cudf::test::dictionary_column_wrapper<int32_t> w{ {1, 0, 3, 0}, {1, 0, 1, 0}}; // Creates a nullable column of dictionary elements with 5 elements and validity initializer. std::vector<int32_t> elements{0, 2, 2, 6, 6}; // keys = {2, 6}, indices = {NULL, 0, NULL, 1, NULL} -dictionary_column_wrapper<int32_t> w(elements, elements + 5, {0, 1, 0, 1, 0}); +cudf::test::dictionary_column_wrapper<int32_t> w(elements, elements + 5, {0, 1, 0, 1, 0}); // Creates a non-nullable dictionary column with 7 string elements std::vector<std::string> strings{"", "aaa", "bbb", "aaa", "bbb", "ccc", "bbb"}; // keys = {"","aaa","bbb","ccc"}, indices = {0, 1, 2, 1, 2, 3, 2} -dictionary_column_wrapper<std::string> d(strings.begin(), strings.end()); +cudf::test::dictionary_column_wrapper<std::string> d(strings.begin(), strings.end()); // Creates a nullable dictionary column with 7 string elements and a validity iterator. // Validity iterator here sets even rows to null. // keys = {"a", "bb"}, indices = {NULL, 1, NULL, 1, NULL, 0, NULL} -auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); -dictionary_column_wrapper<std::string> d({"", "bb", "", "bb", "", "a", ""}, validity); +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i % 2;}); +cudf::test::dictionary_column_wrapper<std::string> d({"", "bb", "", "bb", "", "a", ""}, validity); ``` #### strings_column_wrapper -The `strings_column_wrapper` class should be used to create columns of strings. It provides +The `cudf::test::strings_column_wrapper` class should be used to create columns of strings. It provides constructors that accept an iterator range to generate each string in the column. For nullable columns, an additional iterator can be provided to indicate the validity of each string.
There are also constructors that accept a `std::initializer_list` for the column's strings and @@ -287,27 +293,27 @@ Example: // Creates a non-nullable STRING column with 7 string elements: // {"", "this", "is", "a", "column", "of", "strings"} std::vector strings{"", "this", "is", "a", "column", "of", "strings"}; -strings_column_wrapper s(strings.begin(), strings.end()); +cudf::test::strings_column_wrapper s(strings.begin(), strings.end()); // Creates a nullable STRING column with 7 string elements: // {NULL, "this", NULL, "a", NULL, "of", NULL} std::vector strings{"", "this", "is", "a", "column", "of", "strings"}; -auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); -strings_column_wrapper s(strings.begin(), strings.end(), validity); +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i % 2;}); +cudf::test::strings_column_wrapper s(strings.begin(), strings.end(), validity); // Creates a non-nullable STRING column with 7 string elements: // {"", "this", "is", "a", "column", "of", "strings"} -strings_column_wrapper s({"", "this", "is", "a", "column", "of", "strings"}); +cudf::test::strings_column_wrapper s({"", "this", "is", "a", "column", "of", "strings"}); // Creates a nullable STRING column with 7 string elements: // {NULL, "this", NULL, "a", NULL, "of", NULL} -auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); -strings_column_wrapper s({"", "this", "is", "a", "column", "of", "strings"}, validity); +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i % 2;}); +cudf::test::strings_column_wrapper s({"", "this", "is", "a", "column", "of", "strings"}, validity); ``` #### lists_column_wrapper -The `lists_column_wrapper` class should be used to create columns of lists. It provides +The `cudf::test::lists_column_wrapper` class should be used to create columns of lists. It provides constructors that accept an iterator range to generate each list in the column. For nullable columns, an additional iterator can be provided to indicate the validity of each list. 
There are also constructors that accept a `std::initializer_list` for the column's lists and @@ -318,50 +324,50 @@ Example: ```c++ // Creates an empty LIST column // [] -lists_column_wrapper l{}; +cudf::test::lists_column_wrapper l{}; // Creates a LIST column with 1 list composed of 2 total integers // [{0, 1}] -lists_column_wrapper l{0, 1}; +cudf::test::lists_column_wrapper l{0, 1}; // Creates a LIST column with 3 lists // [{0, 1}, {2, 3}, {4, 5}] -lists_column_wrapper l{ {0, 1}, {2, 3}, {4, 5} }; +cudf::test::lists_column_wrapper l{ {0, 1}, {2, 3}, {4, 5} }; // Creates a LIST of LIST columns with 2 lists on the top level and // 4 below // [ {{0, 1}, {2, 3}}, {{4, 5}, {6, 7}} ] -lists_column_wrapper l{ {{0, 1}, {2, 3}}, {{4, 5}, {6, 7}} }; +cudf::test::lists_column_wrapper l{ {{0, 1}, {2, 3}}, {{4, 5}, {6, 7}} }; // Creates a LIST column with 1 list composed of 5 total integers // [{0, 1, 2, 3, 4}] -auto elements = make_counting_transform_iterator(0, [](auto i){return i*2;}); -lists_column_wrapper l(elements, elements+5); +auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i*2;}); +cudf::test::lists_column_wrapper l(elements, elements+5); // Creates a LIST column with 1 lists composed of 2 total integers // [{0, NULL}] -auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); -lists_column_wrapper l{{0, 1}, validity}; +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i % 2;}); +cudf::test::lists_column_wrapper l{{0, 1}, validity}; // Creates a LIST column with 1 lists composed of 5 total integers // [{0, NULL, 2, NULL, 4}] -auto elements = make_counting_transform_iterator(0, [](auto i){return i*2;}); -auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); -lists_column_wrapper l(elements, elements+5, validity); +auto elements = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i*2;}); +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i % 2;}); +cudf::test::lists_column_wrapper l(elements, elements+5, validity); // Creates a LIST column with 1 list composed of 2 total strings // [{"abc", "def"}] -lists_column_wrapper l{"abc", "def"}; +cudf::test::lists_column_wrapper l{"abc", "def"}; // Creates a LIST of LIST columns with 2 lists on the top level and 4 below // [ {{0, 1}, NULL}, {{4, 5}, NULL} ] -auto validity = make_counting_transform_iterator(0, [](auto i){return i % 2;}); -lists_column_wrapper l{ {{{0, 1}, {2, 3}}, validity}, {{{4, 5}, {6, 7}}, validity} }; +auto validity = cudf::detail::make_counting_transform_iterator(0, [](auto i){return i % 2;}); +cudf::test::lists_column_wrapper l{ {{{0, 1}, {2, 3}}, validity}, {{{4, 5}, {6, 7}}, validity} }; ``` #### structs_column_wrapper -The `structs_column_wrapper` class should be used to create columns of structs. It provides +The `cudf::test::structs_column_wrapper` class should be used to create columns of structs. It provides constructors that accept a vector or initializer list of pre-constructed columns or column wrappers for child columns. For nullable columns, an additional iterator can be provided to indicate the validity of each struct. @@ -370,41 +376,41 @@ Examples: ```c++ // The following constructs a column for struct< int, string >. 
-auto child_int_col = fixed_width_column_wrapper<int32_t>{ 1, 2, 3, 4, 5 }.release(); -auto child_string_col = strings_column_wrapper {"All", "the", "leaves", "are", "brown"}.release(); +auto child_int_col = cudf::test::fixed_width_column_wrapper<int32_t>{ 1, 2, 3, 4, 5 }.release(); +auto child_string_col = cudf::test::strings_column_wrapper {"All", "the", "leaves", "are", "brown"}.release(); -std::vector<std::unique_ptr<column>> child_columns; +std::vector<std::unique_ptr<cudf::column>> child_columns; child_columns.push_back(std::move(child_int_col)); child_columns.push_back(std::move(child_string_col)); -structs_column_wrapper struct_column_wrapper{ +cudf::test::structs_column_wrapper wrapper{ child_columns, {1,0,1,0,1} // Validity }; -auto struct_col {struct_column_wrapper.release()}; +auto struct_col {wrapper.release()}; // The following constructs a column for struct< int, string >. -fixed_width_column_wrapper<int32_t> child_int_col_wrapper{ 1, 2, 3, 4, 5 }; -strings_column_wrapper child_string_col_wrapper {"All", "the", "leaves", "are", "brown"}; +cudf::test::fixed_width_column_wrapper<int32_t> child_int_col_wrapper{ 1, 2, 3, 4, 5 }; +cudf::test::strings_column_wrapper child_string_col_wrapper {"All", "the", "leaves", "are", "brown"}; -structs_column_wrapper struct_column_wrapper{ +cudf::test::structs_column_wrapper wrapper{ {child_int_col_wrapper, child_string_col_wrapper}, {1,0,1,0,1} // Validity }; -auto struct_col {struct_column_wrapper.release()}; +auto struct_col {wrapper.release()}; // The following constructs a column for struct< int, string >. -fixed_width_column_wrapper<int32_t> child_int_col_wrapper{ 1, 2, 3, 4, 5 }; -strings_column_wrapper child_string_col_wrapper {"All", "the", "leaves", "are", "brown"}; +cudf::test::fixed_width_column_wrapper<int32_t> child_int_col_wrapper{ 1, 2, 3, 4, 5 }; +cudf::test::strings_column_wrapper child_string_col_wrapper {"All", "the", "leaves", "are", "brown"}; -structs_column_wrapper struct_column_wrapper{ +cudf::test::structs_column_wrapper wrapper{ {child_int_col_wrapper, child_string_col_wrapper}, cudf::detail::make_counting_transform_iterator(0, [](auto i){ return i % 2; }) // Validity }; -auto struct_col {struct_column_wrapper.release()}; +auto struct_col {wrapper.release()}; ``` ### Column Comparison Utilities diff --git a/cpp/doxygen/main_page.md b/cpp/doxygen/main_page.md index 85b7888b066..308d10601af 100644 --- a/cpp/doxygen/main_page.md +++ b/cpp/doxygen/main_page.md @@ -1,5 +1,5 @@ -# libcudf +# libcudf -libcudf is a C++ GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise +libcudf is a C++ GPU DataFrame library for loading, joining, aggregating, filtering, and otherwise manipulating data. A GPU DataFrame is a column-oriented tabular data structure, so libcudf provides two core data structures: cudf::column, and cudf::table. diff --git a/cpp/doxygen/unicode.md b/cpp/doxygen/unicode.md index d20a18ba34c..1ab09e110c1 100644 --- a/cpp/doxygen/unicode.md +++ b/cpp/doxygen/unicode.md @@ -2,7 +2,7 @@ The strings column currently supports only UTF-8 characters internally. For functions that require character testing (e.g. cudf::strings::all_characters_of_type()) or -case conversion (e.g. cudf::strings::capitalize(), etc) only the 16-bit [Unicode 13.0](http://www.unicode.org/versions/Unicode13.0.0) +case conversion (e.g. cudf::strings::capitalize(), etc) only the 16-bit [Unicode 13.0](http://www.unicode.org/versions/Unicode13.0.0)
diff --git a/cpp/examples/README.md b/cpp/examples/README.md index 30b291d38f4..b2e8dd399d0 100644 --- a/cpp/examples/README.md +++ b/cpp/examples/README.md @@ -5,4 +5,5 @@ libcudf examples. Current examples: -- Basic: example that demonstrates basic use case with libcudf and building a custom application with libcudf. +- Basic: demonstrates a basic use case with libcudf and building a custom application with libcudf +- Strings: demonstrates using libcudf for accessing and creating strings columns and for building custom kernels for strings diff --git a/cpp/examples/basic/CMakeLists.txt b/cpp/examples/basic/CMakeLists.txt index f4bc205d4ba..7e7c6b191b5 100644 --- a/cpp/examples/basic/CMakeLists.txt +++ b/cpp/examples/basic/CMakeLists.txt @@ -1,6 +1,6 @@ # Copyright (c) 2020-2022, NVIDIA CORPORATION. -cmake_minimum_required(VERSION 3.18) +cmake_minimum_required(VERSION 3.23.1) project( basic_example @@ -16,7 +16,7 @@ file( ) include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) -set(CUDF_TAG branch-22.10) +set(CUDF_TAG branch-22.12) CPMFindPackage( NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf GIT_TAG ${CUDF_TAG} diff --git a/cpp/examples/basic/README.md b/cpp/examples/basic/README.md index 75f16e54033..471dcf6694f 100644 --- a/cpp/examples/basic/README.md +++ b/cpp/examples/basic/README.md @@ -15,7 +15,7 @@ cmake -S . -B build/ # Build cmake --build build/ --parallel $PARALLEL_LEVEL # Execute -build/libcudf_example +build/basic_example ``` If your machine does not come with a pre-built libcudf binary, expect the diff --git a/cpp/examples/basic/src/process_csv.cpp b/cpp/examples/basic/src/process_csv.cpp index 5a3914da453..edd14d9ee5f 100644 --- a/cpp/examples/basic/src/process_csv.cpp +++ b/cpp/examples/basic/src/process_csv.cpp @@ -19,6 +19,10 @@ #include #include +#include +#include +#include + #include #include #include @@ -72,6 +76,21 @@ std::unique_ptr average_closing_price(cudf::table_view stock_info_t int main(int argc, char** argv) { + // Construct a CUDA memory resource using RAPIDS Memory Manager (RMM) + // This is the default memory resource for libcudf for allocating device memory. + rmm::mr::cuda_memory_resource cuda_mr{}; + // Construct a memory pool using the CUDA memory resource + // Using a memory pool for device memory allocations is important for good performance in libcudf. + // The pool defaults to allocating half of the available GPU memory. + rmm::mr::pool_memory_resource mr{&cuda_mr}; + + // Set the pool resource to be used by default for all device memory allocations + // Note: It is the user's responsibility to ensure the `mr` object stays alive for the duration of + // it being set as the default + // Also, call this before the first libcudf API call to ensure all data is allocated by the same + // memory resource. 
+ rmm::mr::set_current_device_resource(&mr); + // Read data auto stock_table_with_metadata = read_csv("4stock_5day.csv"); diff --git a/cpp/examples/build.sh b/cpp/examples/build.sh index 079f7358872..7d389cd318d 100755 --- a/cpp/examples/build.sh +++ b/cpp/examples/build.sh @@ -17,8 +17,15 @@ LIB_BUILD_DIR=${LIB_BUILD_DIR:-$(readlink -f "${EXAMPLES_DIR}/../build")} # Basic example BASIC_EXAMPLE_DIR=${EXAMPLES_DIR}/basic BASIC_EXAMPLE_BUILD_DIR=${BASIC_EXAMPLE_DIR}/build - # Configure cmake -S ${BASIC_EXAMPLE_DIR} -B ${BASIC_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" # Build cmake --build ${BASIC_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} + +# Strings example +STRINGS_EXAMPLE_DIR=${EXAMPLES_DIR}/strings +STRINGS_EXAMPLE_BUILD_DIR=${STRINGS_EXAMPLE_DIR}/build +# Configure +cmake -S ${STRINGS_EXAMPLE_DIR} -B ${STRINGS_EXAMPLE_BUILD_DIR} -Dcudf_ROOT="${LIB_BUILD_DIR}" +# Build +cmake --build ${STRINGS_EXAMPLE_BUILD_DIR} -j${PARALLEL_LEVEL} diff --git a/cpp/examples/strings/CMakeLists.txt b/cpp/examples/strings/CMakeLists.txt new file mode 100644 index 00000000000..1a16b2bc8fd --- /dev/null +++ b/cpp/examples/strings/CMakeLists.txt @@ -0,0 +1,49 @@ +# Copyright (c) 2022, NVIDIA CORPORATION. + +cmake_minimum_required(VERSION 3.23.1) + +project( + strings_examples + VERSION 0.0.1 + LANGUAGES CXX CUDA +) + +set(CPM_DOWNLOAD_VERSION v0.35.3) +file( + DOWNLOAD + https://github.com/cpm-cmake/CPM.cmake/releases/download/${CPM_DOWNLOAD_VERSION}/get_cpm.cmake + ${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake +) +include(${CMAKE_BINARY_DIR}/cmake/get_cpm.cmake) + +set(CUDF_TAG branch-22.12) +CPMFindPackage( + NAME cudf GIT_REPOSITORY https://github.com/rapidsai/cudf + GIT_TAG ${CUDF_TAG} + GIT_SHALLOW + TRUE + SOURCE_SUBDIR + cpp +) + +list(APPEND CUDF_CUDA_FLAGS --expt-extended-lambda --expt-relaxed-constexpr) + +# +add_executable(libcudf_apis libcudf_apis.cpp) +target_compile_features(libcudf_apis PRIVATE cxx_std_17) +target_link_libraries(libcudf_apis PRIVATE cudf::cudf nvToolsExt) + +add_executable(custom_with_malloc custom_with_malloc.cu) +target_compile_features(custom_with_malloc PRIVATE cxx_std_17) +target_compile_options(custom_with_malloc PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") +target_link_libraries(custom_with_malloc PRIVATE cudf::cudf nvToolsExt) + +add_executable(custom_prealloc custom_prealloc.cu) +target_compile_features(custom_prealloc PRIVATE cxx_std_17) +target_compile_options(custom_prealloc PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") +target_link_libraries(custom_prealloc PRIVATE cudf::cudf nvToolsExt) + +add_executable(custom_optimized custom_optimized.cu) +target_compile_features(custom_optimized PRIVATE cxx_std_17) +target_compile_options(custom_optimized PRIVATE "$<$:${CUDF_CUDA_FLAGS}>") +target_link_libraries(custom_optimized PRIVATE cudf::cudf nvToolsExt) diff --git a/cpp/examples/strings/README.md b/cpp/examples/strings/README.md new file mode 100644 index 00000000000..241aa064bcc --- /dev/null +++ b/cpp/examples/strings/README.md @@ -0,0 +1,37 @@ +# libcudf C++ examples using strings columns + +This C++ example demonstrates using libcudf APIs to access and create +strings columns. + +The example source code loads a csv file and produces a redacted strings +column from the names column using the values from the visibilities column. + +Four examples are included: +1. Using libcudf APIs to build the output +2. Using a simple custom kernel with dynamic memory +3. Using a custom kernel with pre-allocated device memory +4. 
Using a two-pass approach to improve performance + +These examples are described in more detail in +https://developer.nvidia.com/blog/mastering-string-transformations-in-rapids-libcudf/ + +## Compile and execute + +```bash +# Configure project +cmake -S . -B build/ +# Build +cmake --build build/ --parallel $PARALLEL_LEVEL +# Execute +build/libcudf_apis names.csv +--OR-- +build/custom_with_malloc names.csv +--OR-- +build/custom_prealloc names.csv +--OR-- +build/custom_optimized names.csv +``` + +If your machine does not come with a pre-built libcudf binary, expect the +first build to take some time, as it would build libcudf on the host machine. +It may be sped up by configuring the proper `PARALLEL_LEVEL` number. diff --git a/cpp/examples/strings/common.hpp b/cpp/examples/strings/common.hpp new file mode 100644 index 00000000000..dbd3c4dbd1b --- /dev/null +++ b/cpp/examples/strings/common.hpp @@ -0,0 +1,114 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include +#include +#include +#include + +/** + * @brief Main example function returns redacted strings column. + * + * This function returns a redacted version of the input `names` column + * using the `visibilities` column as in the following example + * ``` + * names visibility --> redacted + * John Doe public D John + * Bobby Joe private X X + * ``` + * + * @param names First and last names separated with a single space + * @param visibilities String values `public` or `private` only + * @return Redacted strings column + */ +std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names, + cudf::column_view const& visibilities); + +/** + * @brief Create CUDA memory resource + */ +auto make_cuda_mr() { return std::make_shared<rmm::mr::cuda_memory_resource>(); } + +/** + * @brief Create a pool device memory resource + */ +auto make_pool_mr() +{ + return rmm::mr::make_owning_wrapper<rmm::mr::pool_memory_resource>(make_cuda_mr()); +} + +/** + * @brief Create memory resource for libcudf functions + */ +std::shared_ptr<rmm::mr::device_memory_resource> create_memory_resource(std::string const& name) +{ + if (name == "pool") { return make_pool_mr(); } + return make_cuda_mr(); +} + +/** + * @brief Main for strings examples + * + * Command line parameters: + * 1. CSV file name/path + * 2. Memory resource (optional): 'pool' or 'cuda' + * + * The stdout includes the number of rows in the input and the output size in bytes. + */ +int main(int argc, char const** argv) +{ + if (argc < 2) { + std::cout << "required parameter: csv-file-path\n"; + return 1; + } + + auto const mr_name = std::string{argc > 2 ?
std::string(argv[2]) : std::string("cuda")}; + auto resource = create_memory_resource(mr_name); + rmm::mr::set_current_device_resource(resource.get()); + + auto const csv_file = std::string{argv[1]}; + auto const csv_result = [csv_file] { + cudf::io::csv_reader_options in_opts = + cudf::io::csv_reader_options::builder(cudf::io::source_info{csv_file}).header(-1); + return cudf::io::read_csv(in_opts).tbl; + }(); + auto const csv_table = csv_result->view(); + + std::cout << "table: " << csv_table.num_rows() << " rows " << csv_table.num_columns() + << " columns\n"; + + auto st = std::chrono::steady_clock::now(); + auto result = redact_strings(csv_table.column(0), csv_table.column(1)); + + std::chrono::duration<double> elapsed = std::chrono::steady_clock::now() - st; + std::cout << "Wall time: " << elapsed.count() << " seconds\n"; + std::cout << "Output size " << result->view().child(1).size() << " bytes\n"; + + return 0; +} diff --git a/cpp/examples/strings/custom_optimized.cu b/cpp/examples/strings/custom_optimized.cu new file mode 100644 index 00000000000..bfe650daa93 --- /dev/null +++ b/cpp/examples/strings/custom_optimized.cu @@ -0,0 +1,165 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common.hpp" + +#include +#include + +#include +#include +#include + +#include + +#include +#include + +/** + * @brief Computes the size of each output row + * + * This thread is called once per row in d_names. + * + * @param d_names Column of names + * @param d_visibilities Column of visibilities + * @param d_sizes Output sizes for each row + */ +__global__ void sizes_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::size_type* d_sizes) +{ + // The row index is resolved from the CUDA thread/block objects + auto index = threadIdx.x + blockIdx.x * blockDim.x; + // There may be more threads than actual rows + if (index >= d_names.size()) return; + + auto const visible = cudf::string_view("public", 6); + auto const redaction = cudf::string_view("X X", 3); + + auto const name = d_names.element<cudf::string_view>(index); + auto const vis = d_visibilities.element<cudf::string_view>(index); + + cudf::size_type result = redaction.size_bytes(); // init to redaction size + if (vis == visible) { + auto const space_idx = name.find(' '); + auto const first = name.substr(0, space_idx); + auto const last_initial = name.substr(space_idx + 1, 1); + + result = first.size_bytes() + last_initial.size_bytes() + 1; + } + + d_sizes[index] = result; +} + +/** + * @brief Builds the output for each row + * + * This thread is called once per row in d_names.
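+ * It assumes d_offsets already holds the exclusive scan of the per-row sizes
+ * produced by sizes_kernel, so each thread writes its row into d_chars at a
+ * non-overlapping offset.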
+ * + * @param d_names Column of names + * @param d_visibilities Column of visibilities + * @param d_offsets Byte offset in `d_chars` for each row + * @param d_chars Output memory for all rows + */ +__global__ void redact_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::size_type const* d_offsets, + char* d_chars) +{ + // The row index is resolved from the CUDA thread/block objects + auto index = threadIdx.x + blockIdx.x * blockDim.x; + // There may be more threads than actual rows + if (index >= d_names.size()) return; + + auto const visible = cudf::string_view("public", 6); + auto const redaction = cudf::string_view("X X", 3); + + // resolve output_ptr using the offsets vector + char* output_ptr = d_chars + d_offsets[index]; + + auto const name = d_names.element<cudf::string_view>(index); + auto const vis = d_visibilities.element<cudf::string_view>(index); + + if (vis == visible) { + auto const space_idx = name.find(' '); + auto const first = name.substr(0, space_idx); + auto const last_initial = name.substr(space_idx + 1, 1); + auto const output_size = first.size_bytes() + last_initial.size_bytes() + 1; + + // build output string + memcpy(output_ptr, last_initial.data(), last_initial.size_bytes()); + output_ptr += last_initial.size_bytes(); + *output_ptr++ = ' '; + memcpy(output_ptr, first.data(), first.size_bytes()); + } else { + memcpy(output_ptr, redaction.data(), redaction.size_bytes()); + } +} + +/** + * @brief Redacts each name per the corresponding visibility entry + * + * This implementation builds the strings column children (offsets and chars) + * directly into device memory for libcudf. + * + * @param names Column of names + * @param visibilities Column of visibilities + * @return Redacted column of names + */ +std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names, + cudf::column_view const& visibilities) +{ + // all device memory operations and kernel functions will run on this stream + auto stream = rmm::cuda_stream_default; + + auto const d_names = cudf::column_device_view::create(names, stream); + auto const d_visibilities = cudf::column_device_view::create(visibilities, stream); + + constexpr int block_size = 128; // this arbitrary size should be a power of 2 + int const blocks = (names.size() + block_size - 1) / block_size; + + nvtxRangePushA("redact_strings"); + + // create offsets vector + auto offsets = rmm::device_uvector<cudf::size_type>(names.size() + 1, stream); + + // compute output sizes + sizes_kernel<<<blocks, block_size, 0, stream.value()>>>( + *d_names, *d_visibilities, offsets.data()); + + // convert sizes to offsets (in place) + thrust::exclusive_scan(rmm::exec_policy(stream), offsets.begin(), offsets.end(), offsets.begin()); + + // last element is the total output size + // (device-to-host copy of 1 integer -- includes synching the stream) + cudf::size_type output_size = offsets.back_element(stream); + + // create chars vector + auto chars = rmm::device_uvector<char>(output_size, stream); + + // build chars output + redact_kernel<<<blocks, block_size, 0, stream.value()>>>( + *d_names, *d_visibilities, offsets.data(), chars.data()); + + // create column from offsets and chars vectors (no copy is performed) + auto result = cudf::make_strings_column(names.size(), std::move(offsets), std::move(chars)); + + // wait for all of the above to finish + stream.synchronize(); + + nvtxRangePop(); + return result; +} diff --git a/cpp/examples/strings/custom_prealloc.cu b/cpp/examples/strings/custom_prealloc.cu new file mode 100644 index 00000000000..c0bae03af5c --- /dev/null +++ b/cpp/examples/strings/custom_prealloc.cu @@ -0,0 +1,126 @@ +/*
+ * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "common.hpp" + +#include +#include +#include + +#include + +#include +#include + +/** + * @brief Builds the output for each row + * + * This thread is called once per row in d_names. + * + * @param d_names Column of names + * @param d_visibilities Column of visibilities + * @param redaction Redacted string replacement + * @param working_memory Output memory for all rows + * @param d_offsets Byte offset in `d_chars` for each row + * @param d_output Output array of string_view objects + */ +__global__ void redact_kernel(cudf::column_device_view const d_names, + cudf::column_device_view const d_visibilities, + cudf::string_view redaction, + char* working_memory, + cudf::offset_type const* d_offsets, + cudf::string_view* d_output) +{ + // The row index is resolved from the CUDA thread/block objects + auto index = threadIdx.x + blockIdx.x * blockDim.x; + // There may be more threads than actual rows + if (index >= d_names.size()) return; + + auto const visible = cudf::string_view("public", 6); + + auto const name = d_names.element<cudf::string_view>(index); + auto const vis = d_visibilities.element<cudf::string_view>(index); + if (vis == visible) { + auto const space_idx = name.find(' '); + auto const first = name.substr(0, space_idx); + auto const last_initial = name.substr(space_idx + 1, 1); + auto const output_size = first.size_bytes() + last_initial.size_bytes() + 1; + + char* output_ptr = working_memory + d_offsets[index]; + d_output[index] = cudf::string_view{output_ptr, output_size}; + + // build output string + memcpy(output_ptr, last_initial.data(), last_initial.size_bytes()); + output_ptr += last_initial.size_bytes(); + *output_ptr++ = ' '; + memcpy(output_ptr, first.data(), first.size_bytes()); + } else { + d_output[index] = cudf::string_view{redaction.data(), redaction.size_bytes()}; + } +} + +/** + * @brief Redacts each name per the corresponding visibility entry + * + * This implementation builds the individual strings into a fixed memory buffer + * and then calls a factory function to gather them into a strings column.
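+ * The working buffer reuses the input offsets, so this approach assumes every
+ * redacted row fits in the byte span of its input row (true for the sample
+ * data, where each name is longer than the 3-byte redaction).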
+/**
+ * @brief Redacts each name per the corresponding visibility entry
+ *
+ * This implementation builds the individual strings into a fixed memory buffer
+ * and then calls a factory function to gather them into a strings column.
+ *
+ * @param names Column of names
+ * @param visibilities Column of visibilities
+ * @return Redacted column of names
+ */
+std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
+                                             cudf::column_view const& visibilities)
+{
+  // all device memory operations and kernel functions will run on this stream
+  auto stream = rmm::cuda_stream_default;
+
+  auto const d_names        = cudf::column_device_view::create(names, stream);
+  auto const d_visibilities = cudf::column_device_view::create(visibilities, stream);
+  auto const d_redaction    = cudf::string_scalar(std::string("X X"), true, stream);
+
+  constexpr int block_size = 128;  // this arbitrary size should be a power of 2
+  auto const blocks        = (names.size() + block_size - 1) / block_size;
+
+  nvtxRangePushA("redact_strings");
+
+  auto const scv     = cudf::strings_column_view(names);
+  auto const offsets = scv.offsets_begin();
+
+  // create working memory to hold the output of each string
+  auto working_memory = rmm::device_uvector<char>(scv.chars_size(), stream);
+  // create a vector for the output strings' pointers
+  auto str_ptrs = rmm::device_uvector<cudf::string_view>(names.size(), stream);
+
+  // build the output strings
+  redact_kernel<<<blocks, block_size, 0, stream.value()>>>(*d_names,
+                                                           *d_visibilities,
+                                                           d_redaction.value(),
+                                                           working_memory.data(),
+                                                           offsets,
+                                                           str_ptrs.data());
+
+  // create strings column from the string_view objects;
+  // this copies all the individual strings into a single output column
+  auto result = cudf::make_strings_column(str_ptrs, cudf::string_view{nullptr, 0}, stream);
+  // temporary memory cleanup cost here for str_ptrs and working_memory
+
+  // wait for all of the above to finish
+  stream.synchronize();
+
+  nvtxRangePop();
+  return result;
+}
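Each of these examples is linked against the shared harness declared in common.hpp, which loads names.csv and invokes redact_strings. A minimal stand-alone driver along those lines might look like the following (an illustrative sketch; the structure here is an assumption, not the actual contents of common.hpp):

#include <cudf/column/column.hpp>
#include <cudf/io/csv.hpp>
#include <cudf/table/table.hpp>

#include <iostream>

// Provided by the example translation unit being built (see above).
std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
                                             cudf::column_view const& visibilities);

int main()
{
  // read the two-column names.csv (no header row)
  auto options =
    cudf::io::csv_reader_options::builder(cudf::io::source_info{"names.csv"}).header(-1);
  auto const csv = cudf::io::read_csv(options.build());

  auto const names        = csv.tbl->view().column(0);
  auto const visibilities = csv.tbl->view().column(1);

  auto const redacted = redact_strings(names, visibilities);
  std::cout << "redacted " << redacted->size() << " names\n";
  return 0;
}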
diff --git a/cpp/examples/strings/custom_with_malloc.cu b/cpp/examples/strings/custom_with_malloc.cu
new file mode 100644
index 00000000000..f1d397ef007
--- /dev/null
+++ b/cpp/examples/strings/custom_with_malloc.cu
@@ -0,0 +1,158 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common.hpp"
+
+#include <cudf/column/column_device_view.cuh>
+#include <cudf/column/column_factories.hpp>
+#include <cudf/scalar/scalar.hpp>
+
+#include <cuda_runtime.h>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+
+/**
+ * @brief Reserve CUDA malloc heap size
+ *
+ * Call this function to change the CUDA malloc heap size limit.
+ * This value depends on the total size of all the malloc()
+ * calls needed for redact_kernel.
+ *
+ * @param heap_size Number of bytes to reserve
+ *                  Default is 1GB
+ */
+void set_malloc_heap_size(size_t heap_size = 1073741824)  // 1GB
+{
+  size_t max_malloc_heap_size = 0;
+  cudaDeviceGetLimit(&max_malloc_heap_size, cudaLimitMallocHeapSize);
+  if (max_malloc_heap_size < heap_size) {
+    max_malloc_heap_size = heap_size;
+    if (cudaDeviceSetLimit(cudaLimitMallocHeapSize, max_malloc_heap_size) != cudaSuccess) {
+      fprintf(stderr, "could not set malloc heap size to %zuMB\n", (heap_size / (1024 * 1024)));
+      throw std::runtime_error("cudaDeviceSetLimit failed");
+    }
+  }
+}
+
+/**
+ * @brief Builds the output for each row
+ *
+ * This kernel executes one thread per row in d_names.
+ *
+ * Note: This uses malloc() in a device kernel, which works
+ * but is not very efficient. It can be useful for prototyping
+ * functions where performance is not yet important.
+ * All calls to malloc() must have a corresponding free() call.
+ * The separate free_kernel is launched for this purpose.
+ *
+ * @param d_names Column of names
+ * @param d_visibilities Column of visibilities
+ * @param redaction Redacted string replacement
+ * @param d_output Output array of string_view objects
+ */
+__global__ void redact_kernel(cudf::column_device_view const d_names,
+                              cudf::column_device_view const d_visibilities,
+                              cudf::string_view redaction,
+                              cudf::string_view* d_output)
+{
+  // The row index is resolved from the CUDA thread/block objects
+  auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  // There may be more threads than actual rows
+  if (index >= d_names.size()) return;
+
+  auto const visible = cudf::string_view("public", 6);
+
+  auto const name = d_names.element<cudf::string_view>(index);
+  auto const vis  = d_visibilities.element<cudf::string_view>(index);
+  if (vis == visible) {
+    auto const space_idx    = name.find(' ');
+    auto const first        = name.substr(0, space_idx);
+    auto const last_initial = name.substr(space_idx + 1, 1);
+    auto const output_size  = first.size_bytes() + last_initial.size_bytes() + 1;
+
+    char* output_ptr = static_cast<char*>(malloc(output_size));
+    d_output[index]  = cudf::string_view{output_ptr, output_size};
+
+    // build output string
+    memcpy(output_ptr, last_initial.data(), last_initial.size_bytes());
+    output_ptr += last_initial.size_bytes();
+    *output_ptr++ = ' ';
+    memcpy(output_ptr, first.data(), first.size_bytes());
+  } else {
+    d_output[index] = cudf::string_view{redaction.data(), redaction.size_bytes()};
+  }
+}
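The 1GB default in set_malloc_heap_size() above is a generous guess. Since every output row is no larger than its input row, the heap demand of redact_kernel is bounded by the input's total character bytes, plus whatever bookkeeping overhead the device allocator adds per allocation. A tighter, input-derived reservation could be computed like this (an illustrative sketch, not part of the example; the 2x overhead allowance is an assumption):

#include <cudf/strings/strings_column_view.hpp>

// Illustrative: bound the device-heap reservation by the input chars size,
// doubled as a rough allowance for device-allocator bookkeeping overhead.
size_t estimate_heap_size(cudf::column_view const& names)
{
  auto const scv = cudf::strings_column_view(names);
  return 2 * static_cast<size_t>(scv.chars_size());
}

set_malloc_heap_size(estimate_heap_size(names)) would then replace the parameterless call made below.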
+/**
+ * @brief Frees the temporary individual string objects created in the
+ * redact_kernel
+ *
+ * Like malloc(), free() is not very efficient, but it must be called for
+ * each malloc() to return the memory to the CUDA malloc heap.
+ *
+ * @param redaction Redacted string replacement (not to be freed)
+ * @param d_output Output array of string_view objects to free
+ * @param count Number of elements in d_output
+ */
+__global__ void free_kernel(cudf::string_view redaction, cudf::string_view* d_output, int count)
+{
+  auto index = threadIdx.x + blockIdx.x * blockDim.x;
+  if (index >= count) return;
+
+  auto ptr = const_cast<char*>(d_output[index].data());
+  if (ptr != redaction.data()) { free(ptr); }
+}
+
+/**
+ * @brief Redacts each name per the corresponding visibility entry
+ *
+ * @param names Column of names
+ * @param visibilities Column of visibilities
+ * @return Redacted column of names
+ */
+std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
+                                             cudf::column_view const& visibilities)
+{
+  // all device memory operations and kernel functions will run on this stream
+  auto stream = rmm::cuda_stream_default;
+
+  set_malloc_heap_size();  // to illustrate adjusting the malloc heap
+
+  auto const d_names        = cudf::column_device_view::create(names, stream);
+  auto const d_visibilities = cudf::column_device_view::create(visibilities, stream);
+  auto const d_redaction    = cudf::string_scalar(std::string("X X"), true, stream);
+
+  constexpr int block_size = 128;  // this arbitrary size should be a power of 2
+  auto const blocks        = (names.size() + block_size - 1) / block_size;
+
+  nvtxRangePushA("redact_strings");
+
+  // create a vector for the output strings' pointers
+  auto str_ptrs = new rmm::device_uvector<cudf::string_view>(names.size(), stream);
+
+  auto result = [&] {
+    // build the output strings
+    redact_kernel<<<blocks, block_size, 0, stream.value()>>>(
+      *d_names, *d_visibilities, d_redaction.value(), str_ptrs->data());
+    // create strings column from the string_view vector;
+    // this copies all the individual strings into a single output column
+    return cudf::make_strings_column(*str_ptrs, cudf::string_view{nullptr, 0}, stream);
+  }();
+
+  // free the individual temporary memory pointers
+  free_kernel<<<blocks, block_size, 0, stream.value()>>>(
+    d_redaction.value(), str_ptrs->data(), names.size());
+  delete str_ptrs;
+
+  // wait for all of the above to finish
+  stream.synchronize();
+
+  nvtxRangePop();
+  return result;
+}
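Before the libcudf-API version below, it is worth fixing the expected output in mind. Worked by hand from the redaction rule shared by all of these implementations, the first rows of names.csv (added later in this change) produce:

  "John Doe",public     -> "D John"
  "Jane Doe",private    -> "X X"
  "Billy Joe",private   -> "X X"
  "James James",public  -> "J James"

That is, visible rows become "<last-initial> <first>" and private rows become the fixed redaction string.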
diff --git a/cpp/examples/strings/libcudf_apis.cpp b/cpp/examples/strings/libcudf_apis.cpp
new file mode 100644
index 00000000000..009e92d8a0d
--- /dev/null
+++ b/cpp/examples/strings/libcudf_apis.cpp
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "common.hpp"
+
+#include <cudf/copying.hpp>
+#include <cudf/scalar/scalar.hpp>
+#include <cudf/strings/combine.hpp>
+#include <cudf/strings/contains.hpp>
+#include <cudf/strings/slice.hpp>
+#include <cudf/strings/split/split.hpp>
+#include <cudf/table/table_view.hpp>
+
+#include <cuda_runtime.h>
+#include <memory>
+
+/**
+ * @brief Redacts each name per the corresponding visibility entry
+ *
+ * This implementation uses libcudf APIs to create the output result.
+ *
+ * @param names Column of names
+ * @param visibilities Column of visibilities
+ * @return Redacted column of names
+ */
+std::unique_ptr<cudf::column> redact_strings(cudf::column_view const& names,
+                                             cudf::column_view const& visibilities)
+{
+  auto const visible   = cudf::string_scalar(std::string("public"));
+  auto const redaction = cudf::string_scalar(std::string("X X"));
+
+  nvtxRangePushA("redact_strings");
+
+  auto const allowed      = cudf::strings::contains(visibilities, visible);
+  auto const redacted     = cudf::copy_if_else(names, redaction, allowed->view());
+  auto const first_last   = cudf::strings::split(redacted->view());
+  auto const first        = first_last->view().column(0);
+  auto const last         = first_last->view().column(1);
+  auto const last_initial = cudf::strings::slice_strings(last, 0, 1);
+
+  auto const last_initial_first = cudf::table_view({last_initial->view(), first});
+
+  auto result = cudf::strings::concatenate(last_initial_first, std::string(" "));
+
+  cudaStreamSynchronize(0);
+
+  nvtxRangePop();
+  return result;
+}
diff --git a/cpp/examples/strings/names.csv b/cpp/examples/strings/names.csv
new file mode 100644
index 00000000000..77dca3e02af
--- /dev/null
+++ b/cpp/examples/strings/names.csv
@@ -0,0 +1,20 @@
+John Doe,public
+Jane Doe,private
+Billy Joe,private
+James James,public
+Michael Frederick,public
+Christopher Cheryl,public
+Jessica Autumn,public
+Matthew Tyrone,public
+Ashley Martha,public
+Jennifer Omar,public
+Joshua Lydia,public
+Amanda Jerome,public
+Daniel Theodore,public
+David Abby,public
+James Neil,public
+Robert Shawna,private
+John Sierra,private
+Joseph Nina,private
+Andrew Tammy,private
+Ryan Nikki,public
diff --git a/cpp/include/cudf/aggregation.hpp b/cpp/include/cudf/aggregation.hpp
index a26a0c7947b..d319041f8b1 100644
--- a/cpp/include/cudf/aggregation.hpp
+++ b/cpp/include/cudf/aggregation.hpp
@@ -515,9 +515,10 @@ std::unique_ptr<Base> make_collect_list_aggregation(
  * @return A COLLECT_SET aggregation object
  */
 template <typename Base = aggregation>
-std::unique_ptr<Base> make_collect_set_aggregation(null_policy null_handling = null_policy::INCLUDE,
-                                                   null_equality nulls_equal = null_equality::EQUAL,
-                                                   nan_equality nans_equal = nan_equality::UNEQUAL);
+std::unique_ptr<Base> make_collect_set_aggregation(
+  null_policy null_handling = null_policy::INCLUDE,
+  null_equality nulls_equal = null_equality::EQUAL,
+  nan_equality nans_equal   = nan_equality::ALL_EQUAL);
 
 /**
  * @brief Factory to create a LAG aggregation
@@ -588,8 +589,9 @@ std::unique_ptr<Base> make_merge_lists_aggregation();
  * @return A MERGE_SETS aggregation object
  */
 template <typename Base = aggregation>
-std::unique_ptr<Base> make_merge_sets_aggregation(null_equality nulls_equal = null_equality::EQUAL,
-                                                  nan_equality nans_equal = nan_equality::UNEQUAL);
+std::unique_ptr<Base> make_merge_sets_aggregation(
+  null_equality nulls_equal = null_equality::EQUAL,
+  nan_equality nans_equal   = nan_equality::ALL_EQUAL);
 
 /**
  * @brief Factory to create a MERGE_M2 aggregation
diff --git a/cpp/include/cudf/binaryop.hpp b/cpp/include/cudf/binaryop.hpp
index c82fd1b52a1..fabe0d86fc4 100644
--- a/cpp/include/cudf/binaryop.hpp
+++ b/cpp/include/cudf/binaryop.hpp
@@ -232,7 +232,7 @@ namespace binops {
 std::pair<rmm::device_buffer, size_type> scalar_col_valid_mask_and(
   column_view const& col,
   scalar const& s,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 namespace compiled {
@@ -255,7 +255,7 @@ void apply_sorting_struct_binary_op(mutable_column_view& out,
   bool is_lhs_scalar,
   bool is_rhs_scalar,
binary_operator op, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); } // namespace detail } // namespace compiled } // namespace binops diff --git a/cpp/include/cudf/column/column.hpp b/cpp/include/cudf/column/column.hpp index c5f6d339ae9..c02991051d9 100644 --- a/cpp/include/cudf/column/column.hpp +++ b/cpp/include/cudf/column/column.hpp @@ -64,7 +64,7 @@ class column { * @param mr Device memory resource to use for all device memory allocations */ column(column const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -109,6 +109,8 @@ class column { * @note This constructor is primarily intended for use in column factory * functions. * + * @throws cudf::logic_error if `size < 0` + * * @param[in] dtype The element type * @param[in] size The number of elements in the column * @param[in] data The column's data @@ -133,6 +135,7 @@ class column { _null_count{null_count}, _children{std::move(children)} { + CUDF_EXPECTS(size >= 0, "Column size cannot be negative."); } /** @@ -146,7 +149,7 @@ class column { * @param mr Device memory resource to use for all device memory allocations */ explicit column(column_view view, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -208,7 +211,7 @@ class column { */ void set_null_mask(rmm::device_buffer const& new_null_mask, size_type new_null_count = UNKNOWN_NULL_COUNT, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Updates the count of null elements. diff --git a/cpp/include/cudf/column/column_device_view.cuh b/cpp/include/cudf/column/column_device_view.cuh index 4f9a09fb621..1361866d0aa 100644 --- a/cpp/include/cudf/column/column_device_view.cuh +++ b/cpp/include/cudf/column/column_device_view.cuh @@ -821,7 +821,7 @@ class alignas(16) column_device_view : public detail::column_device_view_base { *`source_view` available in device memory. */ static std::unique_ptr> create( - column_view source_view, rmm::cuda_stream_view stream = cudf::default_stream_value); + column_view source_view, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Destroy the `column_device_view` object. 
@@ -974,7 +974,7 @@ class alignas(16) mutable_column_device_view : public detail::column_device_view static std::unique_ptr> create(mutable_column_view source_view, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Returns pointer to the base device memory allocation casted to diff --git a/cpp/include/cudf/column/column_factories.hpp b/cpp/include/cudf/column/column_factories.hpp index 5c691d866bd..725faeae626 100644 --- a/cpp/include/cudf/column/column_factories.hpp +++ b/cpp/include/cudf/column/column_factories.hpp @@ -62,6 +62,7 @@ std::unique_ptr make_empty_column(type_id id); * * @throws std::bad_alloc if device memory allocation fails * @throws cudf::logic_error if `type` is not a numeric type + * @throws cudf::logic_error if `size < 0` * * @param[in] type The desired numeric element type * @param[in] size The number of elements in the column @@ -75,7 +76,7 @@ std::unique_ptr make_numeric_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -101,7 +102,7 @@ std::unique_ptr make_numeric_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type."); @@ -119,6 +120,7 @@ std::unique_ptr make_numeric_column( * @note The column's null count is determined by the requested null mask `state`. * * @throws cudf::logic_error if `type` is not a `fixed_point` type. + * @throws cudf::logic_error if `size < 0` * * @param[in] type The desired `fixed_point` element type. * @param[in] size The number of elements in the column. 
@@ -132,7 +134,7 @@ std::unique_ptr make_fixed_point_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -157,7 +159,7 @@ std::unique_ptr make_fixed_point_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type."); @@ -176,6 +178,7 @@ std::unique_ptr make_fixed_point_column( * * @throws std::bad_alloc if device memory allocation fails * @throws cudf::logic_error if `type` is not a timestamp type + * @throws cudf::logic_error if `size < 0` * * @param[in] type The desired timestamp element type * @param[in] size The number of elements in the column @@ -189,7 +192,7 @@ std::unique_ptr make_timestamp_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -215,7 +218,7 @@ std::unique_ptr make_timestamp_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type."); @@ -234,6 +237,7 @@ std::unique_ptr make_timestamp_column( * * @throws std::bad_alloc if device memory allocation fails * @throws cudf::logic_error if `type` is not a duration type + * @throws cudf::logic_error if `size < 0` * * @param[in] type The desired duration element type * @param[in] size The number of elements in the column @@ -247,7 +251,7 @@ std::unique_ptr make_duration_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -273,7 +277,7 @@ std::unique_ptr make_duration_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type."); @@ -292,6 +296,7 @@ std::unique_ptr make_duration_column( * * @throws std::bad_alloc if device memory allocation fails * @throws cudf::logic_error if `type` is not a fixed width type + * @throws cudf::logic_error if `size < 0` * * @param[in] type The desired fixed width type * @param[in] size The number of elements in the column @@ -305,7 +310,7 @@ std::unique_ptr make_fixed_width_column( data_type type, size_type size, mask_state state = mask_state::UNALLOCATED, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()); /** @@ -331,7 +336,7 @@ std::unique_ptr make_fixed_width_column( size_type size, B&& null_mask, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(is_fixed_width(type), "Invalid, non-fixed-width type."); @@ -366,11 +371,11 @@ std::unique_ptr make_fixed_width_column( * @param[in] stream CUDA stream used for device memory operations and kernel launches. * @param[in] mr Device memory resource used for allocation of the column's `null_mask` and children * columns' device memory. - * @return Constructed strings column + * @return Constructed strings column */ std::unique_ptr make_strings_column( cudf::device_span const> strings, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -402,7 +407,7 @@ std::unique_ptr make_strings_column( std::unique_ptr make_strings_column( cudf::device_span string_views, const string_view null_placeholder, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -438,7 +443,7 @@ std::unique_ptr make_strings_column( cudf::device_span offsets, cudf::device_span null_mask = {}, size_type null_count = cudf::UNKNOWN_NULL_COUNT, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -547,7 +552,7 @@ std::unique_ptr make_lists_column( std::unique_ptr child_column, size_type null_count, rmm::device_buffer&& null_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -578,7 +583,7 @@ std::unique_ptr make_structs_column( std::vector>&& child_columns, size_type null_count, rmm::device_buffer&& null_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -598,7 +603,7 @@ std::unique_ptr make_structs_column( std::unique_ptr make_column_from_scalar( scalar const& s, size_type size, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -618,7 +623,7 @@ std::unique_ptr make_column_from_scalar( std::unique_ptr make_dictionary_from_scalar( scalar const& s, size_type size, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/copying.hpp b/cpp/include/cudf/copying.hpp index 1c3ca179d17..63c66335d2d 100644 --- a/cpp/include/cudf/copying.hpp +++ b/cpp/include/cudf/copying.hpp @@ -140,13 +140,12 @@ std::unique_ptr reverse( * If the same index appears more than once in the scatter map, the result is * undefined. 
* + * If any values in `scatter_map` are outside of the interval [-n, n) where `n` + * is the number of rows in the `target` table, behavior is undefined. + * * A negative value `i` in the `scatter_map` is interpreted as `i+n`, where `n` * is the number of rows in the `target` table. * - * @throws cudf::logic_error if `check_bounds == true` and an index exists in - * `scatter_map` outside the range `[-n, n)`, where `n` is the number of rows in - * the target table. If `check_bounds == false`, the behavior is undefined. - * * @param source The input columns containing values to be scattered into the * target columns * @param scatter_map A non-nullable column of integral indices that maps the @@ -154,8 +153,6 @@ std::unique_ptr reverse( * to or less than the number of elements in the source columns. * @param target The set of columns into which values from the source_table * are to be scattered - * @param check_bounds Optionally perform bounds checking on the values of - * `scatter_map` and throw an error if any of its values are out of bounds. * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target */ @@ -163,7 +160,6 @@ std::unique_ptr scatter( table_view const& source, column_view const& scatter_map, table_view const& target, - bool check_bounds = false, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -184,9 +180,8 @@ std::unique_ptr
scatter( * If the same index appears more than once in the scatter map, the result is * undefined. * - * @throws cudf::logic_error if `check_bounds == true` and an index exists in - * `scatter_map` outside the range `[-n, n)`, where `n` is the number of rows in - * the target table. If `check_bounds == false`, the behavior is undefined. + * If any values in `scatter_map` are outside of the interval [-n, n) where `n` + * is the number of rows in the `target` table, behavior is undefined. * * @param source The input scalars containing values to be scattered into the * target columns @@ -194,8 +189,6 @@ std::unique_ptr
scatter( * the rows in the target table to be replaced by source. * @param target The set of columns into which values from the source_table * are to be scattered - * @param check_bounds Optionally perform bounds checking on the values of - * `scatter_map` and throw an error if any of its values are out of bounds. * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target */ @@ -203,7 +196,6 @@ std::unique_ptr
scatter( std::vector> const& source, column_view const& indices, table_view const& target, - bool check_bounds = false, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -1020,12 +1012,19 @@ bool has_nonempty_nulls(column_view const& input); bool may_have_nonempty_nulls(column_view const& input); /** - * @brief Copies `input`, purging any non-empty null rows in the column or its descendants + * @brief Copy `input` into output while purging any non-empty null rows in the column or its + * descendants. * - * LIST columns may have non-empty null rows. - * For example: - * @code{.pseudo} + * If the input column is not of compound type (LIST/STRING/STRUCT/DICTIONARY), the output will be + * the same as input. + * + * The purge operation only applies directly to LIST and STRING columns, but it applies indirectly + * to STRUCT/DICTIONARY columns as well, since these columns may have child columns that + * are LIST or STRING. * + * Examples: + * + * @code{.pseudo} * auto const lists = lists_column_wrapper{ {0,1}, {2,3}, {4,5} }.release(); * cudf::detail::set_null_mask(lists->null_mask(), 1, 2, false); * @@ -1035,33 +1034,13 @@ bool may_have_nonempty_nulls(column_view const& input); * Offsets: [0, 2, 4, 6] * Child: [0, 1, 2, 3, 4, 5] * - * After purging the contents of the list's null rows, the column's contents - * will be: + * After purging the contents of the list's null rows, the column's contents will be: * Validity: 101 * Offsets: [0, 2, 2, 4] * Child: [0, 1, 4, 5] * @endcode * - * The purge operation only applies directly to LIST and STRING columns, but it - * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns - * may have child/descendant columns that are LIST or STRING. - * - * @param input The column whose null rows are to be checked and purged - * @param mr Device memory resource used to allocate the returned column's device memory - * @return std::unique_ptr Column with equivalent contents to `input`, but with - * the contents of null rows purged - */ -std::unique_ptr purge_nonempty_nulls( - lists_column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Copies `input`, purging any non-empty null rows in the column or its descendants - * - * STRING columns may have non-empty null rows. - * For example: * @code{.pseudo} - * * auto const strings = strings_column_wrapper{ "AB", "CD", "EF" }.release(); * cudf::detail::set_null_mask(strings->null_mask(), 1, 2, false); * @@ -1078,26 +1057,7 @@ std::unique_ptr purge_nonempty_nulls( * Child: [A, B, E, F] * @endcode * - * The purge operation only applies directly to LIST and STRING columns, but it - * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns - * may have child/descendant columns that are LIST or STRING. - * - * @param input The column whose null rows are to be checked and purged - * @param mr Device memory resource used to allocate the returned column's device memory - * @return std::unique_ptr Column with equivalent contents to `input`, but with - * the contents of null rows purged - */ -std::unique_ptr purge_nonempty_nulls( - strings_column_view const& input, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); - -/** - * @brief Copies `input`, purging any non-empty null rows in the column or its descendants - * - * STRUCTS columns may have null rows, with non-empty child rows. 
- * For example: * @code{.pseudo} - * * auto const lists = lists_column_wrapper{ {0,1}, {2,3}, {4,5} }; * auto const structs = structs_column_wrapper{ {lists}, null_at(1) }; * @@ -1114,17 +1074,12 @@ std::unique_ptr purge_nonempty_nulls( * Child: [0, 1, 4, 5] * @endcode * - * The purge operation only applies directly to LIST and STRING columns, but it - * applies indirectly to STRUCT columns as well, since LIST and STRUCT columns - * may have child/descendant columns that are LIST or STRING. - * * @param input The column whose null rows are to be checked and purged * @param mr Device memory resource used to allocate the returned column's device memory - * @return std::unique_ptr Column with equivalent contents to `input`, but with - * the contents of null rows purged + * @return A new column with equivalent contents to `input`, but with null rows purged */ std::unique_ptr purge_nonempty_nulls( - structs_column_view const& input, + column_view const& input, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ diff --git a/cpp/include/cudf/datetime.hpp b/cpp/include/cudf/datetime.hpp index a8955ffb17c..fb04336871f 100644 --- a/cpp/include/cudf/datetime.hpp +++ b/cpp/include/cudf/datetime.hpp @@ -36,7 +36,7 @@ namespace datetime { */ /** - * @brief Extracts year from any date time type and returns an int16_t + * @brief Extracts year from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -50,7 +50,7 @@ std::unique_ptr extract_year( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts month from any date time type and returns an int16_t + * @brief Extracts month from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -64,7 +64,7 @@ std::unique_ptr extract_month( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts day from any date time type and returns an int16_t + * @brief Extracts day from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -78,7 +78,7 @@ std::unique_ptr extract_day( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts day from any date time type and returns an int16_t + * @brief Extracts day from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -92,7 +92,7 @@ std::unique_ptr extract_weekday( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts hour from any date time type and returns an int16_t + * @brief Extracts hour from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -106,7 +106,7 @@ std::unique_ptr extract_hour( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts minute from any date time type and returns an int16_t + * @brief Extracts minute from any datetime type and returns an int16_t * cudf::column. 
* * @param column cudf::column_view of the input datetime values @@ -120,7 +120,7 @@ std::unique_ptr extract_minute( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Extracts second from any date time type and returns an int16_t + * @brief Extracts second from any datetime type and returns an int16_t * cudf::column. * * @param column cudf::column_view of the input datetime values @@ -133,6 +133,57 @@ std::unique_ptr extract_second( cudf::column_view const& column, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Extracts millisecond fraction from any datetime type and returns an int16_t + * cudf::column. + * + * A millisecond fraction is only the 3 digits that make up the millisecond portion of a duration. + * For example, the millisecond fraction of 1.234567890 seconds is 234. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t milliseconds + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_millisecond_fraction( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Extracts microsecond fraction from any datetime type and returns an int16_t + * cudf::column. + * + * A microsecond fraction is only the 3 digits that make up the microsecond portion of a duration. + * For example, the microsecond fraction of 1.234567890 seconds is 567. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t microseconds + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_microsecond_fraction( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +/** + * @brief Extracts nanosecond fraction from any datetime type and returns an int16_t + * cudf::column. + * + * A nanosecond fraction is only the 3 digits that make up the nanosecond portion of a duration. + * For example, the nanosecond fraction of 1.234567890 seconds is 890. + * + * @param column cudf::column_view of the input datetime values + * @param mr Device memory resource used to allocate device memory of the returned column + * + * @returns cudf::column of the extracted int16_t nanoseconds + * @throw cudf::logic_error if input column datatype is not TIMESTAMP + */ +std::unique_ptr extract_nanosecond_fraction( + cudf::column_view const& column, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of group /** * @addtogroup datetime_compute @@ -141,7 +192,7 @@ std::unique_ptr extract_second( */ /** - * @brief Computes the last day of the month in date time type and returns a TIMESTAMP_DAYS + * @brief Computes the last day of the month in datetime type and returns a TIMESTAMP_DAYS * cudf::column. 
* * @param column cudf::column_view of the input datetime values @@ -169,7 +220,7 @@ std::unique_ptr day_of_year( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Adds or subtracts a number of months from the date time type and returns a + * @brief Adds or subtracts a number of months from the datetime type and returns a * timestamp column that is of the same type as the input `timestamps` column. * * For a given row, if the `timestamps` or the `months` column value is null, @@ -204,7 +255,7 @@ std::unique_ptr add_calendrical_months( rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** - * @brief Adds or subtracts a number of months from the date time type and returns a + * @brief Adds or subtracts a number of months from the datetime type and returns a * timestamp column that is of the same type as the input `timestamps` column. * * For a given row, if the `timestamps` value is null, the output for that row is null. diff --git a/cpp/include/cudf/detail/binaryop.hpp b/cpp/include/cudf/detail/binaryop.hpp index 8deac88a645..ffd8be971ab 100644 --- a/cpp/include/cudf/detail/binaryop.hpp +++ b/cpp/include/cudf/detail/binaryop.hpp @@ -35,7 +35,7 @@ std::unique_ptr binary_operation( column_view const& rhs, std::string const& ptx, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -49,7 +49,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -63,7 +63,7 @@ std::unique_ptr binary_operation( scalar const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -77,7 +77,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/concatenate.hpp b/cpp/include/cudf/detail/concatenate.hpp index 08a37acead2..925029597a6 100644 --- a/cpp/include/cudf/detail/concatenate.hpp +++ b/cpp/include/cudf/detail/concatenate.hpp @@ -35,7 +35,7 @@ namespace detail { */ std::unique_ptr concatenate( host_span columns_to_concat, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -45,7 +45,7 @@ std::unique_ptr concatenate( */ std::unique_ptr
concatenate( host_span tables_to_concat, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/copy.cuh b/cpp/include/cudf/detail/copy.cuh deleted file mode 100644 index 348f629a51a..00000000000 --- a/cpp/include/cudf/detail/copy.cuh +++ /dev/null @@ -1,50 +0,0 @@ -/* - * Copyright (c) 2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ -#pragma once - -#include -#include - -#include - -namespace cudf::detail { - -/** - * @copydoc cudf::purge_nonempty_nulls(structs_column_view const&, rmm::mr::device_memory_resource*) - * - * @tparam ColumnViewT View type (lists_column_view, strings_column_view, or strings_column_view) - * @param stream CUDA stream used for device memory operations and kernel launches - */ -template -std::unique_ptr purge_nonempty_nulls(ColumnViewT const& input, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) -{ - // Implement via identity gather. - auto const input_column = input.parent(); - auto const gather_begin = thrust::counting_iterator(0); - auto const gather_end = gather_begin + input_column.size(); - - auto gathered_table = cudf::detail::gather(table_view{{input_column}}, - gather_begin, - gather_end, - out_of_bounds_policy::DONT_CHECK, - stream, - mr); - return std::move(gathered_table->release()[0]); -} - -} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/copy.hpp b/cpp/include/cudf/detail/copy.hpp index a2cbe8c5238..8c3f315284d 100644 --- a/cpp/include/cudf/detail/copy.hpp +++ b/cpp/include/cudf/detail/copy.hpp @@ -77,7 +77,7 @@ ColumnView slice(ColumnView const& input, cudf::size_type begin, cudf::size_type */ std::vector slice(column_view const& input, host_span indices, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::slice(column_view const&, std::initializer_list) * @@ -85,7 +85,7 @@ std::vector slice(column_view const& input, */ std::vector slice(column_view const& input, std::initializer_list indices, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::slice(table_view const&, host_span) @@ -94,7 +94,7 @@ std::vector slice(column_view const& input, */ std::vector slice(table_view const& input, host_span indices, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::slice(table_view const&, std::initializer_list) * @@ -102,7 +102,7 @@ std::vector slice(table_view const& input, */ std::vector slice(table_view const& input, std::initializer_list indices, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::split(column_view const&, host_span) @@ -111,7 +111,7 @@ std::vector slice(table_view const& input, */ std::vector split(column_view const& input, host_span 
splits, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::split(column_view const&, std::initializer_list) * @@ -119,7 +119,7 @@ std::vector split(column_view const& input, */ std::vector split(column_view const& input, std::initializer_list splits, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::split(table_view const&, host_span) @@ -128,7 +128,7 @@ std::vector split(column_view const& input, */ std::vector split(table_view const& input, host_span splits, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::split(table_view const&, std::initializer_list) * @@ -136,7 +136,7 @@ std::vector split(table_view const& input, */ std::vector split(table_view const& input, std::initializer_list splits, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::shift(column_view const&,size_type,scalar const&, @@ -148,7 +148,7 @@ std::unique_ptr shift( column_view const& input, size_type offset, scalar const& fill_value, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -189,7 +189,7 @@ std::unique_ptr segmented_shift( device_span segment_offsets, size_type offset, scalar const& fill_value, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -200,7 +200,7 @@ std::unique_ptr segmented_shift( std::vector contiguous_split( cudf::table_view const& input, std::vector const& splits, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -209,7 +209,7 @@ std::vector contiguous_split( * @param stream Optional CUDA stream on which to execute kernels **/ packed_columns pack(cudf::table_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -222,7 +222,7 @@ std::unique_ptr allocate_like( column_view const& input, size_type size, mask_allocation_policy mask_alloc = mask_allocation_policy::RETAIN, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -235,7 +235,7 @@ std::unique_ptr copy_if_else( column_view const& lhs, column_view const& rhs, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -248,7 +248,7 @@ std::unique_ptr copy_if_else( scalar const& lhs, column_view const& rhs, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -261,7 +261,7 @@ std::unique_ptr copy_if_else( column_view const& lhs, scalar const& rhs, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()); /** @@ -274,7 +274,7 @@ std::unique_ptr copy_if_else( scalar const& lhs, scalar const& rhs, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -287,7 +287,7 @@ std::unique_ptr
sample( size_type const n, sample_with_replacement replacement = sample_with_replacement::FALSE, int64_t const seed = 0, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -298,7 +298,7 @@ std::unique_ptr
sample(
 std::unique_ptr<scalar> get_element(
   column_view const& input,
   size_type index,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -306,16 +306,24 @@ std::unique_ptr<scalar> get_element(
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
-bool has_nonempty_nulls(column_view const& input,
-                        rmm::cuda_stream_view stream = cudf::default_stream_value);
+bool has_nonempty_nulls(column_view const& input, rmm::cuda_stream_view stream);
 
 /**
  * @copydoc cudf::may_have_nonempty_nulls
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
-bool may_have_nonempty_nulls(column_view const& input,
-                             rmm::cuda_stream_view stream = cudf::default_stream_value);
+bool may_have_nonempty_nulls(column_view const& input, rmm::cuda_stream_view stream);
+
+/**
+ * @copydoc cudf::purge_nonempty_nulls
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<column> purge_nonempty_nulls(
+  column_view const& input,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 } // namespace detail
 } // namespace cudf
diff --git a/cpp/include/cudf/detail/copy_if.cuh b/cpp/include/cudf/detail/copy_if.cuh
index 99d9f5181c7..6eea72a1e0d 100644
--- a/cpp/include/cudf/detail/copy_if.cuh
+++ b/cpp/include/cudf/detail/copy_if.cuh
@@ -323,7 +323,7 @@ template <typename Filter>
 std::unique_ptr<table>
copy_if( table_view const& input, Filter filter, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_FUNC_RANGE(); diff --git a/cpp/include/cudf/detail/copy_range.cuh b/cpp/include/cudf/detail/copy_range.cuh index aaba729f2f2..22714e97dfa 100644 --- a/cpp/include/cudf/detail/copy_range.cuh +++ b/cpp/include/cudf/detail/copy_range.cuh @@ -135,7 +135,7 @@ void copy_range(SourceValueIterator source_value_begin, mutable_column_view& target, size_type target_begin, size_type target_end, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream) { CUDF_EXPECTS((target_begin <= target_end) && (target_begin >= 0) && (target_begin < target.size()) && (target_end <= target.size()), @@ -196,7 +196,7 @@ void copy_range_in_place(column_view const& source, size_type source_begin, size_type source_end, size_type target_begin, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::copy_range @@ -209,7 +209,7 @@ std::unique_ptr copy_range( size_type source_begin, size_type source_end, size_type target_begin, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/datetime.hpp b/cpp/include/cudf/detail/datetime.hpp index 7a2545fbdcf..c2e3c32b65f 100644 --- a/cpp/include/cudf/detail/datetime.hpp +++ b/cpp/include/cudf/detail/datetime.hpp @@ -31,7 +31,7 @@ namespace detail { */ std::unique_ptr extract_year( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -41,7 +41,7 @@ std::unique_ptr extract_year( */ std::unique_ptr extract_month( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -51,7 +51,7 @@ std::unique_ptr extract_month( */ std::unique_ptr extract_day( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -61,7 +61,7 @@ std::unique_ptr extract_day( */ std::unique_ptr extract_weekday( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -71,7 +71,7 @@ std::unique_ptr extract_weekday( */ std::unique_ptr extract_hour( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -81,7 +81,7 @@ std::unique_ptr extract_hour( */ std::unique_ptr extract_minute( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -91,7 +91,40 @@ std::unique_ptr extract_minute( */ std::unique_ptr extract_second( cudf::column_view const& column, - rmm::cuda_stream_view stream = cudf::default_stream_value, + 
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @copydoc cudf::extract_millisecond_fraction(cudf::column_view const&,
+ * rmm::mr::device_memory_resource *)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<cudf::column> extract_millisecond_fraction(
+  cudf::column_view const& column,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @copydoc cudf::extract_microsecond_fraction(cudf::column_view const&,
+ * rmm::mr::device_memory_resource *)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<cudf::column> extract_microsecond_fraction(
+  cudf::column_view const& column,
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+/**
+ * @copydoc cudf::extract_nanosecond_fraction(cudf::column_view const&,
+ * rmm::mr::device_memory_resource *)
+ *
+ * @param stream CUDA stream used for device memory operations and kernel launches.
+ */
+std::unique_ptr<cudf::column> extract_nanosecond_fraction(
+  cudf::column_view const& column,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -101,7 +134,7 @@ std::unique_ptr<cudf::column> extract_second(
  */
 std::unique_ptr<cudf::column> last_day_of_month(
   cudf::column_view const& column,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -111,7 +144,7 @@ std::unique_ptr<cudf::column> last_day_of_month(
  */
 std::unique_ptr<cudf::column> day_of_year(
   cudf::column_view const& column,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -123,7 +156,7 @@ std::unique_ptr<cudf::column> day_of_year(
 std::unique_ptr<cudf::column> add_calendrical_months(
   cudf::column_view const& timestamps,
   cudf::column_view const& months,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -135,7 +168,7 @@ std::unique_ptr<cudf::column> add_calendrical_months(
 std::unique_ptr<cudf::column> add_calendrical_months(
   cudf::column_view const& timestamps,
   cudf::scalar const& months,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -145,12 +178,12 @@ std::unique_ptr<cudf::column> add_calendrical_months(
  */
 std::unique_ptr<cudf::column> is_leap_year(
   cudf::column_view const& column,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 std::unique_ptr<cudf::column> extract_quarter(
   cudf::column_view const& column,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 }  // namespace detail
diff --git a/cpp/include/cudf/detail/fill.hpp b/cpp/include/cudf/detail/fill.hpp
index f236fa7fd43..e34acfff6b9 100644
--- a/cpp/include/cudf/detail/fill.hpp
+++ b/cpp/include/cudf/detail/fill.hpp
@@ -36,7 +36,7 @@ void fill_in_place(mutable_column_view& destination,
                    size_type begin,
                    size_type end,
                    scalar const& value,
-                   rmm::cuda_stream_view stream = cudf::default_stream_value);
+                   rmm::cuda_stream_view stream);

 /**
  * @copydoc cudf::fill
@@ -48,7 +48,7 @@ std::unique_ptr<column> fill(
   size_type begin,
   size_type end,
   scalar const& value,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 }  // namespace detail
diff --git a/cpp/include/cudf/detail/gather.cuh b/cpp/include/cudf/detail/gather.cuh
index 8bb117c3dd0..57d834e6277 100644
--- a/cpp/include/cudf/detail/gather.cuh
+++ b/cpp/include/cudf/detail/gather.cuh
@@ -128,7 +128,7 @@ void gather_helper(InputItr source_itr,
 {
   using map_type = typename std::iterator_traits<MapIterator>::value_type;
   if (nullify_out_of_bounds) {
-    thrust::gather_if(rmm::exec_policy(stream),
+    thrust::gather_if(rmm::exec_policy_nosync(stream),
                       gather_map_begin,
                       gather_map_end,
                       gather_map_begin,
@@ -137,7 +137,7 @@
                       bounds_checker<map_type>{0, source_size});
   } else {
     thrust::gather(
-      rmm::exec_policy(stream), gather_map_begin, gather_map_end, source_itr, target_itr);
+      rmm::exec_policy_nosync(stream), gather_map_begin, gather_map_end, source_itr, target_itr);
   }
 }

@@ -652,7 +652,7 @@ std::unique_ptr<table> gather(
   MapIterator gather_map_begin,
   MapIterator gather_map_end,
   out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   std::vector<std::unique_ptr<column>> destination_columns;
diff --git a/cpp/include/cudf/detail/gather.hpp b/cpp/include/cudf/detail/gather.hpp
index fccad73591e..9d61a8de184 100644
--- a/cpp/include/cudf/detail/gather.hpp
+++ b/cpp/include/cudf/detail/gather.hpp
@@ -66,7 +66,7 @@ std::unique_ptr<table> gather(
   column_view const& gather_map,
   out_of_bounds_policy bounds_policy,
   negative_index_policy neg_indices,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /**
@@ -81,7 +81,7 @@ std::unique_ptr<table> gather(
   device_span<size_type const> const gather_map,
   out_of_bounds_policy bounds_policy,
   negative_index_policy neg_indices,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 }  // namespace detail
diff --git a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp
index faf92c996d1..9e64048b7b4 100644
--- a/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp
+++ b/cpp/include/cudf/detail/groupby/group_replace_nulls.hpp
@@ -40,7 +40,7 @@ std::unique_ptr<column> group_replace_nulls(
   cudf::column_view const& grouped_value,
   device_span<size_type const> group_labels,
   cudf::replace_policy replace_policy,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 }  // namespace detail
diff --git a/cpp/include/cudf/detail/groupby/sort_helper.hpp b/cpp/include/cudf/detail/groupby/sort_helper.hpp
index 8705bbd29cb..a5060cd3d36 100644
--- a/cpp/include/cudf/detail/groupby/sort_helper.hpp
+++ b/cpp/include/cudf/detail/groupby/sort_helper.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2019-2022, NVIDIA CORPORATION.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
@@ -218,8 +218,6 @@ struct sort_groupby_helper {
   column_ptr _unsorted_keys_labels;  ///< Group labels for unsorted _keys
   column_ptr _keys_bitmask_column;   ///< Column representing rows with one or more nulls values
   table_view _keys;                  ///< Input keys to sort by
-  table_view _unflattened_keys;      ///< Input keys, unflattened and possibly nested
-  structs::detail::flattened_table _flattened;  ///< Support datastructures for _keys

   index_vector_ptr _group_offsets;  ///< Indices into sorted _keys indicating starting index of each groups
diff --git a/cpp/include/cudf/detail/hashing.hpp b/cpp/include/cudf/detail/hashing.hpp
index 66cbf24e607..b7469d80a8d 100644
--- a/cpp/include/cudf/detail/hashing.hpp
+++ b/cpp/include/cudf/detail/hashing.hpp
@@ -35,24 +35,24 @@ std::unique_ptr<column> hash(
   table_view const& input,
   hash_id hash_function = hash_id::HASH_MURMUR3,
   uint32_t seed = cudf::DEFAULT_HASH_SEED,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 std::unique_ptr<column> murmur_hash3_32(
   table_view const& input,
   uint32_t seed = cudf::DEFAULT_HASH_SEED,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 std::unique_ptr<column> spark_murmur_hash3_32(
   table_view const& input,
   uint32_t seed = cudf::DEFAULT_HASH_SEED,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 std::unique_ptr<column> md5_hash(
   table_view const& input,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

 /* Copyright 2005-2014 Daniel James.
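// The hunks above drop the defaulted `cudf::default_stream_value` argument from
// most detail APIs, so `stream` becomes a required parameter at every detail
// call site (a few template entry points keep a `cudf::get_default_stream()`
// default instead). A minimal sketch of what a call site looks like afterwards;
// the wrapper name, the `input`/`value` arguments, and the include paths are
// illustrative assumptions, not part of this diff:
//
//   #include <cudf/detail/fill.hpp>
//   #include <cudf/utilities/default_stream.hpp>
//
//   #include <rmm/mr/device/per_device_resource.hpp>
//
//   std::unique_ptr<cudf::column> fill_first_ten(cudf::column_view const& input,
//                                                cudf::scalar const& value)
//   {
//     // The stream is now spelled out explicitly at every detail call site.
//     return cudf::detail::fill(
//       input, 0, 10, value, cudf::get_default_stream(), rmm::mr::get_current_device_resource());
//   }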
diff --git a/cpp/include/cudf/detail/interop.hpp b/cpp/include/cudf/detail/interop.hpp
index 1417be358de..5a5bbe7f683 100644
--- a/cpp/include/cudf/detail/interop.hpp
+++ b/cpp/include/cudf/detail/interop.hpp
@@ -16,7 +16,13 @@

 #pragma once

+// We disable warning 611 because the `arrow::TableBatchReader` only partially
+// overrides the `ReadNext` method of `arrow::RecordBatchReader`, triggering
+// warning 611-D from nvcc.
+#pragma nv_diag_suppress 611
 #include
+#pragma nv_diag_default 611
+
 #include
 #include
 #include
@@ -34,7 +40,7 @@ namespace detail {
  */
 std::unique_ptr<table>
from_dlpack( DLManagedTensor const* managed_tensor, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -44,7 +50,7 @@ std::unique_ptr
from_dlpack( */ DLManagedTensor* to_dlpack( table_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); // Creating arrow as per given type_id and buffer arguments @@ -104,7 +110,7 @@ data_type arrow_to_cudf_type(arrow::DataType const& arrow_type); */ std::shared_ptr to_arrow(table_view input, std::vector const& metadata = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), arrow::MemoryPool* ar_mr = arrow::default_memory_pool()); /** @@ -114,7 +120,7 @@ std::shared_ptr to_arrow(table_view input, */ std::unique_ptr
from_arrow( arrow::Table const& input_table, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/is_element_valid.hpp b/cpp/include/cudf/detail/is_element_valid.hpp index f9f42bdae1d..72a85d42eb3 100644 --- a/cpp/include/cudf/detail/is_element_valid.hpp +++ b/cpp/include/cudf/detail/is_element_valid.hpp @@ -41,7 +41,7 @@ namespace detail { bool is_element_valid_sync(column_view const& col_view, size_type element_index, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/join.hpp b/cpp/include/cudf/detail/join.hpp index a0385674f36..2dfe31091ac 100644 --- a/cpp/include/cudf/detail/join.hpp +++ b/cpp/include/cudf/detail/join.hpp @@ -91,7 +91,7 @@ struct hash_join { */ hash_join(cudf::table_view const& build, cudf::null_equality compare_nulls, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::hash_join::inner_join diff --git a/cpp/include/cudf/detail/label_bins.hpp b/cpp/include/cudf/detail/label_bins.hpp index 846893b70f6..f556c81c371 100644 --- a/cpp/include/cudf/detail/label_bins.hpp +++ b/cpp/include/cudf/detail/label_bins.hpp @@ -51,7 +51,7 @@ std::unique_ptr label_bins( inclusive left_inclusive, column_view const& right_edges, inclusive right_inclusive, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/detail/null_mask.hpp b/cpp/include/cudf/detail/null_mask.hpp index 5d4f62e0feb..a0e04d7b215 100644 --- a/cpp/include/cudf/detail/null_mask.hpp +++ b/cpp/include/cudf/detail/null_mask.hpp @@ -34,7 +34,7 @@ namespace detail { rmm::device_buffer create_null_mask( size_type size, mask_state state, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -46,7 +46,7 @@ void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @brief Given a bitmask, counts the number of set (1) bits in the range diff --git a/cpp/include/cudf/detail/nvtx/nvtx3.hpp b/cpp/include/cudf/detail/nvtx/nvtx3.hpp index fb90ea668f5..c77714181ef 100644 --- a/cpp/include/cudf/detail/nvtx/nvtx3.hpp +++ b/cpp/include/cudf/detail/nvtx/nvtx3.hpp @@ -1907,7 +1907,7 @@ inline void mark(event_attributes const& attr) noexcept #define NVTX3_FUNC_RANGE_IN(D) \ static ::nvtx3::registered_message const nvtx3_func_name__{__func__}; \ static ::nvtx3::event_attributes const nvtx3_func_attr__{nvtx3_func_name__}; \ - ::nvtx3::domain_thread_range const nvtx3_range__{nvtx3_func_attr__}; + [[maybe_unused]] ::nvtx3::domain_thread_range const nvtx3_range__{nvtx3_func_attr__}; /** * @brief Convenience macro for generating a range in the global domain from the diff --git a/cpp/include/cudf/detail/quantiles.hpp b/cpp/include/cudf/detail/quantiles.hpp index 82b8ff35bfc..3764b03641e 100644 --- a/cpp/include/cudf/detail/quantiles.hpp +++ b/cpp/include/cudf/detail/quantiles.hpp @@ -16,7 +16,7 @@ #pragma once #include -#include +#include #include #include @@ -35,7 
+35,7 @@ std::unique_ptr quantile( interpolation interp = interpolation::LINEAR, column_view const& ordered_indices = {}, bool exact = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -50,7 +50,7 @@ std::unique_ptr
quantiles( cudf::sorted is_input_sorted = sorted::NO, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -62,7 +62,7 @@ std::unique_ptr
quantiles( std::unique_ptr percentile_approx( tdigest::tdigest_column_view const& input, column_view const& percentiles, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/reduction_functions.hpp b/cpp/include/cudf/detail/reduction_functions.hpp index 7877fe13951..a2de286f283 100644 --- a/cpp/include/cudf/detail/reduction_functions.hpp +++ b/cpp/include/cudf/detail/reduction_functions.hpp @@ -46,7 +46,7 @@ std::unique_ptr sum( column_view const& col, data_type const output_dtype, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -67,7 +67,7 @@ std::unique_ptr min( column_view const& col, data_type const output_dtype, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -88,7 +88,7 @@ std::unique_ptr max( column_view const& col, data_type const output_dtype, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -110,7 +110,7 @@ std::unique_ptr any( column_view const& col, data_type const output_dtype, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -132,7 +132,7 @@ std::unique_ptr all( column_view const& col, data_type const output_dtype, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -154,7 +154,7 @@ std::unique_ptr product( column_view const& col, data_type const output_dtype, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -174,7 +174,7 @@ std::unique_ptr product( std::unique_ptr sum_of_squares( column_view const& col, data_type const output_dtype, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -194,7 +194,7 @@ std::unique_ptr sum_of_squares( std::unique_ptr mean( column_view const& col, data_type const output_dtype, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -217,7 +217,7 @@ std::unique_ptr variance( column_view const& col, data_type const output_dtype, cudf::size_type ddof, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -240,7 +240,7 @@ std::unique_ptr standard_deviation( column_view const& col, data_type const output_dtype, cudf::size_type ddof, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -271,7 
+271,7 @@ std::unique_ptr nth_element( column_view const& col, size_type n, null_policy null_handling, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -286,7 +286,7 @@ std::unique_ptr nth_element( std::unique_ptr collect_list( column_view const& col, null_policy null_handling, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -299,7 +299,7 @@ std::unique_ptr collect_list( */ std::unique_ptr merge_lists( lists_column_view const& col, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -318,7 +318,7 @@ std::unique_ptr collect_set( null_policy null_handling, null_equality nulls_equal, nan_equality nans_equal, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -335,7 +335,7 @@ std::unique_ptr merge_sets( lists_column_view const& col, null_equality nulls_equal, nan_equality nans_equal, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -363,7 +363,7 @@ std::unique_ptr segmented_sum( data_type const output_dtype, null_policy null_handling, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -391,7 +391,7 @@ std::unique_ptr segmented_product( data_type const output_dtype, null_policy null_handling, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -418,7 +418,7 @@ std::unique_ptr segmented_min( data_type const output_dtype, null_policy null_handling, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -445,7 +445,7 @@ std::unique_ptr segmented_max( data_type const output_dtype, null_policy null_handling, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -473,7 +473,7 @@ std::unique_ptr segmented_any( data_type const output_dtype, null_policy null_handling, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -501,7 +501,7 @@ std::unique_ptr segmented_all( data_type const output_dtype, null_policy null_handling, std::optional> init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace reduction diff --git a/cpp/include/cudf/detail/repeat.hpp b/cpp/include/cudf/detail/repeat.hpp index 9bd03878579..69d9705556f 100644 --- a/cpp/include/cudf/detail/repeat.hpp +++ b/cpp/include/cudf/detail/repeat.hpp @@ -36,7 
+36,7 @@ std::unique_ptr
repeat( table_view const& input_table, column_view const& count, bool check_count, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -48,7 +48,7 @@ std::unique_ptr
repeat( std::unique_ptr
repeat( table_view const& input_table, size_type count, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/replace.hpp b/cpp/include/cudf/detail/replace.hpp index 4c2c6e3b171..9721c6e9849 100644 --- a/cpp/include/cudf/detail/replace.hpp +++ b/cpp/include/cudf/detail/replace.hpp @@ -34,7 +34,7 @@ namespace detail { std::unique_ptr replace_nulls( column_view const& input, cudf::column_view const& replacement, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -46,7 +46,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nulls( column_view const& input, scalar const& replacement, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -58,7 +58,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nulls( column_view const& input, replace_policy const& replace_policy, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -70,7 +70,7 @@ std::unique_ptr replace_nulls( std::unique_ptr replace_nans( column_view const& input, column_view const& replacement, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -82,7 +82,7 @@ std::unique_ptr replace_nans( std::unique_ptr replace_nans( column_view const& input, scalar const& replacement, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -94,7 +94,7 @@ std::unique_ptr find_and_replace_all( column_view const& input_col, column_view const& values_to_replace, column_view const& replacement_values, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -104,7 +104,7 @@ std::unique_ptr find_and_replace_all( */ std::unique_ptr normalize_nans_and_zeros( column_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/reshape.hpp b/cpp/include/cudf/detail/reshape.hpp index be10b2c582d..ccffcbc61df 100644 --- a/cpp/include/cudf/detail/reshape.hpp +++ b/cpp/include/cudf/detail/reshape.hpp @@ -33,7 +33,7 @@ namespace detail { std::unique_ptr
tile( table_view const& input, size_type count, - rmm::cuda_stream_view = cudf::default_stream_value, + rmm::cuda_stream_view, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -43,7 +43,7 @@ std::unique_ptr
tile( */ std::unique_ptr interleave_columns( table_view const& input, - rmm::cuda_stream_view = cudf::default_stream_value, + rmm::cuda_stream_view, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/rolling.hpp b/cpp/include/cudf/detail/rolling.hpp index e0bdde98c0a..dcaece2bafc 100644 --- a/cpp/include/cudf/detail/rolling.hpp +++ b/cpp/include/cudf/detail/rolling.hpp @@ -45,7 +45,7 @@ std::unique_ptr rolling_window( column_view const& following_window, size_type min_periods, rolling_aggregation const& agg, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/round.hpp b/cpp/include/cudf/detail/round.hpp index 49e6c528eb3..1e5612919f4 100644 --- a/cpp/include/cudf/detail/round.hpp +++ b/cpp/include/cudf/detail/round.hpp @@ -35,7 +35,7 @@ std::unique_ptr round( column_view const& input, int32_t decimal_places, rounding_method method, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/scan.hpp b/cpp/include/cudf/detail/scan.hpp index 13dddd3b0c8..f4b2d51d0cb 100644 --- a/cpp/include/cudf/detail/scan.hpp +++ b/cpp/include/cudf/detail/scan.hpp @@ -38,7 +38,7 @@ namespace detail { * `agg` is not Min or Max. * * @param input The input column view for the scan. - * @param agg unique_ptr to aggregation operator applied by the scan. + * @param agg Aggregation operator applied by the scan * @param null_handling Exclude null values when computing the result if null_policy::EXCLUDE. * Include nulls if null_policy::INCLUDE. Any operation with a null results in * a null. @@ -47,7 +47,7 @@ namespace detail { * @returns Column with scan results. */ std::unique_ptr scan_exclusive(column_view const& input, - std::unique_ptr const& agg, + scan_aggregation const& agg, null_policy null_handling, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); @@ -64,7 +64,7 @@ std::unique_ptr scan_exclusive(column_view const& input, * but the `agg` is not Min or Max. * * @param input The input column view for the scan. - * @param agg unique_ptr to aggregation operator applied by the scan. + * @param agg Aggregation operator applied by the scan * @param null_handling Exclude null values when computing the result if null_policy::EXCLUDE. * Include nulls if null_policy::INCLUDE. Any operation with a null results in * a null. @@ -73,7 +73,7 @@ std::unique_ptr scan_exclusive(column_view const& input, * @returns Column with scan results. */ std::unique_ptr scan_inclusive(column_view const& input, - std::unique_ptr const& agg, + scan_aggregation const& agg, null_policy null_handling, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); diff --git a/cpp/include/cudf/detail/scatter.cuh b/cpp/include/cudf/detail/scatter.cuh index 09b16b11a73..c8b17e22df2 100644 --- a/cpp/include/cudf/detail/scatter.cuh +++ b/cpp/include/cudf/detail/scatter.cuh @@ -79,14 +79,14 @@ auto scatter_to_gather(MapIterator scatter_map_begin, // We'll use the `numeric_limits::lowest()` value for this since it should always be outside the // valid range. 
auto gather_map = rmm::device_uvector(gather_rows, stream); - thrust::uninitialized_fill(rmm::exec_policy(stream), + thrust::uninitialized_fill(rmm::exec_policy_nosync(stream), gather_map.begin(), gather_map.end(), std::numeric_limits::lowest()); // Convert scatter map to a gather map thrust::scatter( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(std::distance(scatter_map_begin, scatter_map_end)), scatter_map_begin, @@ -114,13 +114,13 @@ auto scatter_to_gather_complement(MapIterator scatter_map_begin, rmm::cuda_stream_view stream) { auto gather_map = rmm::device_uvector(gather_rows, stream); - thrust::sequence(rmm::exec_policy(stream), gather_map.begin(), gather_map.end(), 0); + thrust::sequence(rmm::exec_policy_nosync(stream), gather_map.begin(), gather_map.end(), 0); auto const out_of_bounds_begin = thrust::make_constant_iterator(std::numeric_limits::lowest()); auto const out_of_bounds_end = out_of_bounds_begin + thrust::distance(scatter_map_begin, scatter_map_end); - thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), out_of_bounds_begin, out_of_bounds_end, scatter_map_begin, @@ -152,7 +152,7 @@ struct column_scatterer_impl(), source.begin() + cudf::distance(scatter_map_begin, scatter_map_end), scatter_map_begin, @@ -218,14 +218,15 @@ struct column_scatterer_impl { // first combine keys so both dictionaries have the same set auto target_matched = dictionary::detail::add_keys(target, source.keys(), stream, mr); auto const target_view = dictionary_column_view(target_matched->view()); - auto source_matched = dictionary::detail::set_keys(source, target_view.keys(), stream); + auto source_matched = dictionary::detail::set_keys( + source, target_view.keys(), stream, rmm::mr::get_current_device_resource()); auto const source_view = dictionary_column_view(source_matched->view()); // now build the new indices by doing a scatter on just the matched indices auto source_itr = indexalator_factory::make_input_iterator(source_view.indices()); auto new_indices = std::make_unique(target_view.get_indices_annotated(), stream, mr); auto target_itr = indexalator_factory::make_output_iterator(new_indices->mutable_view()); - thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), source_itr, source_itr + std::distance(scatter_map_begin, scatter_map_end), scatter_map_begin, @@ -390,24 +391,13 @@ std::unique_ptr
scatter( MapIterator scatter_map_begin, MapIterator scatter_map_end, table_view const& target, - bool check_bounds = false, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_FUNC_RANGE(); using MapType = typename thrust::iterator_traits::value_type; - if (check_bounds) { - auto const begin = -target.num_rows(); - auto const end = target.num_rows(); - auto bounds = bounds_checker{begin, end}; - CUDF_EXPECTS( - std::distance(scatter_map_begin, scatter_map_end) == - thrust::count_if(rmm::exec_policy(stream), scatter_map_begin, scatter_map_end, bounds), - "Scatter map index out of bounds"); - } - CUDF_EXPECTS(std::distance(scatter_map_begin, scatter_map_end) <= source.num_rows(), "scatter map size should be <= to number of rows in source"); diff --git a/cpp/include/cudf/detail/scatter.hpp b/cpp/include/cudf/detail/scatter.hpp index 8c993368ff2..7c4b04537ea 100644 --- a/cpp/include/cudf/detail/scatter.hpp +++ b/cpp/include/cudf/detail/scatter.hpp @@ -45,10 +45,8 @@ namespace detail { * * If the same index appears more than once in the scatter map, the result is * undefined. - * - * @throws cudf::logic_error if `check_bounds == true` and an index exists in - * `scatter_map` outside the range `[-n, n)`, where `n` is the number of rows in - * the target table. If `check_bounds == false`, the behavior is undefined. + * If any values in `scatter_map` are outside of the interval [-n, n) where `n` + * is the number of rows in the `target` table, behavior is undefined. * * @param source The input columns containing values to be scattered into the * target columns @@ -57,8 +55,6 @@ namespace detail { * to or less than the number of elements in the source columns. * @param target The set of columns into which values from the source_table * are to be scattered - * @param check_bounds Optionally perform bounds checking on the values of - * `scatter_map` and throw an error if any of its values are out of bounds. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target @@ -67,8 +63,7 @@ std::unique_ptr
scatter( table_view const& source, column_view const& scatter_map, table_view const& target, - bool check_bounds = false, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -81,8 +76,7 @@ std::unique_ptr
scatter( table_view const& source, device_span const scatter_map, table_view const& target, - bool check_bounds = false, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -101,9 +95,8 @@ std::unique_ptr
scatter( * If the same index appears more than once in the scatter map, the result is * undefined. * - * @throws cudf::logic_error if `check_bounds == true` and an index exists in - * `scatter_map` outside the range `[-n, n)`, where `n` is the number of rows in - * the target table. If `check_bounds == false`, the behavior is undefined. + * If any values in `indices` are outside of the interval [-n, n) where `n` + * is the number of rows in the `target` table, behavior is undefined. * * @param source The input scalars containing values to be scattered into the * target columns @@ -111,8 +104,6 @@ std::unique_ptr
scatter( * the rows in the target table to be replaced by source. * @param target The set of columns into which values from the source_table * are to be scattered - * @param check_bounds Optionally perform bounds checking on the values of - * `scatter_map` and throw an error if any of its values are out of bounds. * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned table's device memory * @return Result of scattering values from source to target @@ -121,8 +112,7 @@ std::unique_ptr
scatter( std::vector> const& source, column_view const& indices, table_view const& target, - bool check_bounds = false, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -137,7 +127,7 @@ std::unique_ptr
boolean_mask_scatter( table_view const& source, table_view const& target, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -153,7 +143,7 @@ std::unique_ptr
boolean_mask_scatter( std::vector> const& source, table_view const& target, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/sequence.hpp b/cpp/include/cudf/detail/sequence.hpp index 8b3ef46d0ad..4a9bf5c74e1 100644 --- a/cpp/include/cudf/detail/sequence.hpp +++ b/cpp/include/cudf/detail/sequence.hpp @@ -36,7 +36,7 @@ std::unique_ptr sequence( size_type size, scalar const& init, scalar const& step, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -49,7 +49,7 @@ std::unique_ptr sequence( std::unique_ptr sequence( size_type size, scalar const& init, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -64,7 +64,7 @@ std::unique_ptr calendrical_month_sequence( size_type size, scalar const& init, size_type months, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/sorting.hpp b/cpp/include/cudf/detail/sorting.hpp index a68407d9194..66b3f5071c6 100644 --- a/cpp/include/cudf/detail/sorting.hpp +++ b/cpp/include/cudf/detail/sorting.hpp @@ -36,7 +36,7 @@ std::unique_ptr sorted_order( table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -48,7 +48,7 @@ std::unique_ptr stable_sorted_order( table_view const& input, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -61,7 +61,7 @@ std::unique_ptr
sort_by_key( table_view const& keys, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -74,7 +74,7 @@ std::unique_ptr
stable_sort_by_key( table_view const& keys, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -87,7 +87,7 @@ std::unique_ptr segmented_sorted_order( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -100,7 +100,7 @@ std::unique_ptr stable_segmented_sorted_order( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -114,7 +114,7 @@ std::unique_ptr
segmented_sort_by_key( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -128,7 +128,7 @@ std::unique_ptr
stable_segmented_sort_by_key( column_view const& segment_offsets, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -140,7 +140,7 @@ std::unique_ptr
sort( table_view const& values, std::vector const& column_order = {}, std::vector const& null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/stream_compaction.hpp b/cpp/include/cudf/detail/stream_compaction.hpp index 0db929c523c..e725718ed22 100644 --- a/cpp/include/cudf/detail/stream_compaction.hpp +++ b/cpp/include/cudf/detail/stream_compaction.hpp @@ -36,7 +36,7 @@ std::unique_ptr
drop_nulls( table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -49,7 +49,7 @@ std::unique_ptr
drop_nans( table_view const& input, std::vector const& keys, cudf::size_type keep_threshold, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -60,7 +60,7 @@ std::unique_ptr
drop_nans( std::unique_ptr
apply_boolean_mask( table_view const& input, column_view const& boolean_mask, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -73,7 +73,7 @@ std::unique_ptr
unique( std::vector const& keys, duplicate_keep_option keep, null_equality nulls_equal = null_equality::EQUAL, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -87,7 +87,7 @@ std::unique_ptr
distinct( duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -116,7 +116,7 @@ std::unique_ptr
stable_distinct( duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -138,7 +138,7 @@ rmm::device_uvector get_distinct_indices( duplicate_keep_option keep = duplicate_keep_option::KEEP_ANY, null_equality nulls_equal = null_equality::EQUAL, nan_equality nans_equal = nan_equality::ALL_EQUAL, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -149,7 +149,7 @@ rmm::device_uvector get_distinct_indices( cudf::size_type unique_count(column_view const& input, null_policy null_handling, nan_policy nan_handling, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::unique_count(table_view const&, null_equality) @@ -158,7 +158,7 @@ cudf::size_type unique_count(column_view const& input, */ cudf::size_type unique_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @copydoc cudf::distinct_count(column_view const&, null_policy, nan_policy) @@ -168,7 +168,7 @@ cudf::size_type unique_count(table_view const& input, cudf::size_type distinct_count(column_view const& input, null_policy null_handling, nan_policy nan_handling, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream); /** * @copydoc cudf::distinct_count(table_view const&, null_equality) @@ -177,7 +177,7 @@ cudf::size_type distinct_count(column_view const& input, */ cudf::size_type distinct_count(table_view const& input, null_equality nulls_equal = null_equality::EQUAL, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream = cudf::get_default_stream()); } // namespace detail } // namespace cudf diff --git a/cpp/include/cudf/detail/structs/utilities.hpp b/cpp/include/cudf/detail/structs/utilities.hpp index 1a4b8f02dd3..115c8ccd90e 100644 --- a/cpp/include/cudf/detail/structs/utilities.hpp +++ b/cpp/include/cudf/detail/structs/utilities.hpp @@ -189,7 +189,7 @@ void superimpose_parent_nulls(bitmask_type const* parent_null_mask, */ std::tuple> superimpose_parent_nulls( column_view const& parent, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -215,7 +215,7 @@ std::tuple> superimpose_paren */ std::tuple> superimpose_parent_nulls( table_view const& table, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/detail/tdigest/tdigest.hpp b/cpp/include/cudf/detail/tdigest/tdigest.hpp index 41e734ffe83..9df3f9daf3f 100644 --- a/cpp/include/cudf/detail/tdigest/tdigest.hpp +++ b/cpp/include/cudf/detail/tdigest/tdigest.hpp @@ -23,9 +23,8 @@ #include namespace cudf { -namespace detail { - namespace tdigest { +namespace detail { /** * @brief Generate a tdigest column from a grouped set of numeric input values. 
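// The stream_compaction hunks above leave an intentional split: `unique`,
// `distinct`, and the table_view overloads of `unique_count`/`distinct_count`
// keep a `cudf::get_default_stream()` default, while the column_view overloads
// now require an explicit stream. A short sketch of the latter, assuming a
// caller-owned stream; the function name and `col` are illustrative, not part
// of this diff:
//
//   #include <cudf/detail/stream_compaction.hpp>
//
//   #include <rmm/cuda_stream.hpp>
//
//   cudf::size_type count_distinct(cudf::column_view const& col)
//   {
//     rmm::cuda_stream stream;  // caller-owned, non-default stream
//     return cudf::detail::distinct_count(
//       col, cudf::null_policy::INCLUDE, cudf::nan_policy::NAN_IS_VALID, stream.view());
//   }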
@@ -139,7 +138,7 @@ std::unique_ptr make_tdigest_column( std::unique_ptr&& tdigest_offsets, std::unique_ptr&& min_values, std::unique_ptr&& max_values, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -153,7 +152,7 @@ std::unique_ptr make_tdigest_column( * @returns An empty tdigest column. */ std::unique_ptr make_empty_tdigest_column( - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -167,7 +166,7 @@ std::unique_ptr make_empty_tdigest_column( * @returns An empty tdigest scalar. */ std::unique_ptr make_empty_tdigest_scalar( - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -328,6 +327,6 @@ std::unique_ptr reduce_merge_tdigest(column_view const& input, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -} // namespace tdigest } // namespace detail +} // namespace tdigest } // namespace cudf diff --git a/cpp/include/cudf/detail/transform.hpp b/cpp/include/cudf/detail/transform.hpp index 929c4700873..8e19ebb8da7 100644 --- a/cpp/include/cudf/detail/transform.hpp +++ b/cpp/include/cudf/detail/transform.hpp @@ -34,7 +34,7 @@ std::unique_ptr transform( std::string const& unary_udf, data_type output_type, bool is_ptx, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -45,7 +45,7 @@ std::unique_ptr transform( std::unique_ptr compute_column( table_view const table, ast::operation const& expr, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -55,7 +55,7 @@ std::unique_ptr compute_column( */ std::pair, size_type> nans_to_nulls( column_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -65,7 +65,7 @@ std::pair, size_type> nans_to_nulls( */ std::pair, cudf::size_type> bools_to_mask( column_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -75,7 +75,7 @@ std::pair, cudf::size_type> bools_to_mask( */ std::pair, std::unique_ptr> encode( cudf::table_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -86,7 +86,7 @@ std::pair, std::unique_ptr> encode( std::pair, table_view> one_hot_encode( column_view const& input, column_view const& categories, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -98,7 +98,7 @@ std::unique_ptr mask_to_bools( bitmask_type const* null_mask, size_type begin_bit, size_type end_bit, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -108,7 +108,7 
@@ std::unique_ptr mask_to_bools( */ std::unique_ptr row_bit_count( table_view const& t, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/transpose.hpp b/cpp/include/cudf/detail/transpose.hpp index 367421a5ee1..0470d625edc 100644 --- a/cpp/include/cudf/detail/transpose.hpp +++ b/cpp/include/cudf/detail/transpose.hpp @@ -30,7 +30,7 @@ namespace detail { */ std::pair, table_view> transpose( table_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/unary.hpp b/cpp/include/cudf/detail/unary.hpp index 5d1c29aba78..0e1c047d9b0 100644 --- a/cpp/include/cudf/detail/unary.hpp +++ b/cpp/include/cudf/detail/unary.hpp @@ -50,7 +50,7 @@ std::unique_ptr true_if( InputIterator end, size_type size, Predicate p, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto output = @@ -71,7 +71,7 @@ std::unique_ptr true_if( std::unique_ptr unary_operation( cudf::column_view const& input, cudf::unary_operator op, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -82,7 +82,7 @@ std::unique_ptr unary_operation( std::unique_ptr cast( column_view const& input, data_type type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -92,7 +92,7 @@ std::unique_ptr cast( */ std::unique_ptr is_nan( cudf::column_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -102,7 +102,7 @@ std::unique_ptr is_nan( */ std::unique_ptr is_not_nan( cudf::column_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/detail/utilities/algorithm.cuh b/cpp/include/cudf/detail/utilities/algorithm.cuh index f05a09a8df1..4e83e219072 100644 --- a/cpp/include/cudf/detail/utilities/algorithm.cuh +++ b/cpp/include/cudf/detail/utilities/algorithm.cuh @@ -25,4 +25,4 @@ __device__ __forceinline__ T accumulate(Iterator first, Iterator last, T init, B } return init; } -} // namespace cudf::detail \ No newline at end of file +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/cuda.cuh b/cpp/include/cudf/detail/utilities/cuda.cuh index d57078f892f..cdbc26701d1 100644 --- a/cpp/include/cudf/detail/utilities/cuda.cuh +++ b/cpp/include/cudf/detail/utilities/cuda.cuh @@ -170,8 +170,7 @@ __global__ void single_thread_kernel(F f) * @param stream CUDA stream used for the kernel launch */ template -void device_single_thread(Functor functor, - rmm::cuda_stream_view stream = cudf::default_stream_value) +void device_single_thread(Functor functor, rmm::cuda_stream_view stream) { single_thread_kernel<<<1, 1, 0, stream.value()>>>(functor); } diff --git 
a/cpp/tests/strings/utilities.h b/cpp/include/cudf/detail/utilities/default_stream.hpp similarity index 64% rename from cpp/tests/strings/utilities.h rename to cpp/include/cudf/detail/utilities/default_stream.hpp index d6f0e9c4f1f..fa438f142b7 100644 --- a/cpp/tests/strings/utilities.h +++ b/cpp/include/cudf/detail/utilities/default_stream.hpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020, NVIDIA CORPORATION. + * Copyright (c) 2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -13,18 +13,24 @@ * See the License for the specific language governing permissions and * limitations under the License. */ + #pragma once -#include +#include +#include namespace cudf { -namespace test { + +namespace detail { + /** - * @brief Utility will verify the given strings column is empty. + * @brief Default stream for cudf * - * @param strings_column Column of strings to check + * Use this value to ensure the correct stream is used when compiled with per + * thread default stream. */ -void expect_strings_empty(cudf::column_view strings_column); +extern rmm::cuda_stream_view const default_stream_value; + +} // namespace detail -} // namespace test } // namespace cudf diff --git a/cpp/include/cudf/detail/utilities/linked_column.hpp b/cpp/include/cudf/detail/utilities/linked_column.hpp index 05b46cc8e13..059e32730e5 100644 --- a/cpp/include/cudf/detail/utilities/linked_column.hpp +++ b/cpp/include/cudf/detail/utilities/linked_column.hpp @@ -77,4 +77,4 @@ inline LinkedColVector table_to_linked_columns(table_view const& table) return LinkedColVector(linked_it, linked_it + table.num_columns()); } -} // namespace cudf::detail \ No newline at end of file +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/pinned_allocator.hpp b/cpp/include/cudf/detail/utilities/pinned_allocator.hpp new file mode 100644 index 00000000000..84abf7c014f --- /dev/null +++ b/cpp/include/cudf/detail/utilities/pinned_allocator.hpp @@ -0,0 +1,202 @@ +/* + * Copyright 2008-2022 NVIDIA Corporation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include // for bad_alloc + +#include + +namespace cudf::detail { + +/*! \p pinned_allocator is a CUDA-specific host memory allocator + * that employs \c cudaMallocHost for allocation. + * + * This implementation is ported from the experimental/pinned_allocator + * that Thrust used to provide. + * + * \see https://en.cppreference.com/w/cpp/memory/allocator + */ +template +class pinned_allocator; + +/*! \p pinned_allocator is a CUDA-specific host memory allocator + * that employs \c cudaMallocHost for allocation. + * + * This implementation is ported from the experimental/pinned_allocator + * that Thrust used to provide. 
+ * + * \see https://en.cppreference.com/w/cpp/memory/allocator + */ +template <> +class pinned_allocator { + public: + using value_type = void; ///< The type of the elements in the allocator + using pointer = void*; ///< The type returned by address() / allocate() + using const_pointer = const void*; ///< The type returned by address() + using size_type = std::size_t; ///< The type used for the size of the allocation + using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers + + /** + * @brief converts a `pinned_allocator` to `pinned_allocator` + */ + template + struct rebind { + using other = pinned_allocator; ///< The rebound type + }; +}; + +/*! \p pinned_allocator is a CUDA-specific host memory allocator + * that employs \c cudaMallocHost for allocation. + * + * This implementation is ported from the experimental/pinned_allocator + * that Thrust used to provide. + * + * \see https://en.cppreference.com/w/cpp/memory/allocator + */ +template +class pinned_allocator { + public: + using value_type = T; ///< The type of the elements in the allocator + using pointer = T*; ///< The type returned by address() / allocate() + using const_pointer = const T*; ///< The type returned by address() + using reference = T&; ///< The parameter type for address() + using const_reference = const T&; ///< The parameter type for address() + using size_type = std::size_t; ///< The type used for the size of the allocation + using difference_type = std::ptrdiff_t; ///< The type of the distance between two pointers + + /** + * @brief converts a `pinned_allocator` to `pinned_allocator` + */ + template + struct rebind { + using other = pinned_allocator; ///< The rebound type + }; + + /** + * @brief pinned_allocator's null constructor does nothing. + */ + __host__ __device__ inline pinned_allocator() {} + + /** + * @brief pinned_allocator's null destructor does nothing. + */ + __host__ __device__ inline ~pinned_allocator() {} + + /** + * @brief pinned_allocator's copy constructor does nothing. + */ + __host__ __device__ inline pinned_allocator(pinned_allocator const&) {} + + /** + * @brief pinned_allocator's copy constructor does nothing. + * + * This version of pinned_allocator's copy constructor + * is templated on the \c value_type of the pinned_allocator + * to copy from. It is provided merely for convenience; it + * does nothing. + */ + template + __host__ __device__ inline pinned_allocator(pinned_allocator const&) + { + } + + /** + * @brief This method returns the address of a \c reference of + * interest. + * + * @param r The \c reference of interest. + * @return \c r's address. + */ + __host__ __device__ inline pointer address(reference r) { return &r; } + + /** + * @brief This method returns the address of a \c const_reference + * of interest. + * + * @param r The \c const_reference of interest. + * @return \c r's address. + */ + __host__ __device__ inline const_pointer address(const_reference r) { return &r; } + + /** + * @brief This method allocates storage for objects in pinned host + * memory. + * + * @param cnt The number of objects to allocate. + * @return a \c pointer to the newly allocated objects. + * @note The second parameter to this function is meant as a + * hint pointer to a nearby memory location, but is + * not used by this allocator. + * @note This method does not invoke \p value_type's constructor. + * It is the responsibility of the caller to initialize the + * objects at the returned \c pointer. 
+ */ + __host__ inline pointer allocate(size_type cnt, const_pointer /*hint*/ = 0) + { + if (cnt > this->max_size()) { throw std::bad_alloc(); } // end if + + pointer result(0); + CUDF_CUDA_TRY(cudaMallocHost(reinterpret_cast(&result), cnt * sizeof(value_type))); + return result; + } + + /** + * @brief This method deallocates pinned host memory previously allocated + * with this \c pinned_allocator. + * + * @param p A \c pointer to the previously allocated memory. + * @note The second parameter is the number of objects previously allocated + * but is ignored by this allocator. + * @note This method does not invoke \p value_type's destructor. + * It is the responsibility of the caller to destroy + * the objects stored at \p p. + */ + __host__ inline void deallocate(pointer p, size_type /*cnt*/) { CUDF_CUDA_TRY(cudaFreeHost(p)); } + + /** + * @brief This method returns the maximum size of the \c cnt parameter + * accepted by the \p allocate() method. + * + * @return The maximum number of objects that may be allocated + * by a single call to \p allocate(). + */ + inline size_type max_size() const { return (std::numeric_limits::max)() / sizeof(T); } + + /** + * @brief This method tests this \p pinned_allocator for equality to + * another. + * + * @param x The other \p pinned_allocator of interest. + * @return This method always returns \c true. + */ + __host__ __device__ inline bool operator==(pinned_allocator const& x) const { return true; } + + /** + * @brief This method tests this \p pinned_allocator for inequality + * to another. + * + * @param x The other \p pinned_allocator of interest. + * @return This method always returns \c false. + */ + __host__ __device__ inline bool operator!=(pinned_allocator const& x) const + { + return !operator==(x); + } +}; +} // namespace cudf::detail diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp index d7fdb153c19..75e5222ab97 100644 --- a/cpp/include/cudf/detail/utilities/vector_factories.hpp +++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp @@ -72,7 +72,7 @@ rmm::device_uvector make_zeroed_device_uvector_async( template rmm::device_uvector make_zeroed_device_uvector_sync( std::size_t size, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { rmm::device_uvector ret(size, stream, mr); @@ -148,7 +148,7 @@ rmm::device_uvector make_device_uvector_async( template rmm::device_uvector make_device_uvector_async( device_span source_data, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { rmm::device_uvector ret(source_data.size(), stream, mr); @@ -201,7 +201,7 @@ rmm::device_uvector make_device_uvector_async( template rmm::device_uvector make_device_uvector_sync( host_span source_data, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto ret = make_device_uvector_async(source_data, stream, mr); @@ -228,7 +228,7 @@ template < std::is_convertible_v>>* = nullptr> rmm::device_uvector make_device_uvector_sync( Container const& c, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { return 
diff --git a/cpp/include/cudf/detail/utilities/vector_factories.hpp b/cpp/include/cudf/detail/utilities/vector_factories.hpp
index d7fdb153c19..75e5222ab97 100644
--- a/cpp/include/cudf/detail/utilities/vector_factories.hpp
+++ b/cpp/include/cudf/detail/utilities/vector_factories.hpp
@@ -72,7 +72,7 @@ rmm::device_uvector<T> make_zeroed_device_uvector_async(
 template <typename T>
 rmm::device_uvector<T> make_zeroed_device_uvector_sync(
   std::size_t size,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   rmm::device_uvector<T> ret(size, stream, mr);
@@ -148,7 +148,7 @@ rmm::device_uvector<T> make_device_uvector_async(
 template <typename T>
 rmm::device_uvector<T> make_device_uvector_async(
   device_span<T const> source_data,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   rmm::device_uvector<T> ret(source_data.size(), stream, mr);
@@ -201,7 +201,7 @@ rmm::device_uvector<T> make_device_uvector_async(
 template <typename T>
 rmm::device_uvector<T> make_device_uvector_sync(
   host_span<T const> source_data,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   auto ret = make_device_uvector_async(source_data, stream, mr);
@@ -228,7 +228,7 @@ template <
     std::is_convertible_v<Container, host_span<typename Container::value_type const>>>* = nullptr>
 rmm::device_uvector<typename Container::value_type> make_device_uvector_sync(
   Container const& c,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   return make_device_uvector_sync(host_span<typename Container::value_type const>{c}, stream, mr);
@@ -249,7 +249,7 @@ rmm::device_uvector<typename Container::value_type> make_device_uvector_sync(
 template <typename T>
 rmm::device_uvector<T> make_device_uvector_sync(
   device_span<T const> source_data,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   auto ret = make_device_uvector_async(source_data, stream, mr);
@@ -276,7 +276,7 @@ template <
     std::is_convertible_v<Container, device_span<typename Container::value_type const>>>* = nullptr>
 rmm::device_uvector<typename Container::value_type> make_device_uvector_sync(
   Container const& c,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   return make_device_uvector_sync(device_span<typename Container::value_type const>{c}, stream, mr);
@@ -366,8 +366,8 @@ template <
   typename Container,
   std::enable_if_t<
     std::is_convertible_v<Container, device_span<typename Container::value_type const>>>* = nullptr>
-std::vector<typename Container::value_type> make_std_vector_sync(
-  Container const& c, rmm::cuda_stream_view stream = cudf::default_stream_value)
+std::vector<typename Container::value_type> make_std_vector_sync(Container const& c,
+                                                                 rmm::cuda_stream_view stream)
 {
   return make_std_vector_sync(device_span<typename Container::value_type const>{c}, stream);
 }
@@ -423,8 +423,7 @@ thrust::host_vector<T> make_host_vector_async(
  * @return The data copied to the host
  */
 template <typename T>
-thrust::host_vector<T> make_host_vector_sync(
-  device_span<T const> v, rmm::cuda_stream_view stream = cudf::default_stream_value)
+thrust::host_vector<T> make_host_vector_sync(device_span<T const> v, rmm::cuda_stream_view stream)
 {
   auto result = make_host_vector_async(v, stream);
   stream.synchronize();
@@ -448,7 +447,7 @@ template <
   std::enable_if_t<
     std::is_convertible_v<Container, device_span<typename Container::value_type const>>>* = nullptr>
 thrust::host_vector<typename Container::value_type> make_host_vector_sync(
-  Container const& c, rmm::cuda_stream_view stream = cudf::default_stream_value)
+  Container const& c, rmm::cuda_stream_view stream)
 {
   return make_host_vector_sync(device_span<typename Container::value_type const>{c}, stream);
 }
diff --git a/cpp/include/cudf/detail/valid_if.cuh b/cpp/include/cudf/detail/valid_if.cuh
index 0fe7edad21d..04c78bed17d 100644
--- a/cpp/include/cudf/detail/valid_if.cuh
+++ b/cpp/include/cudf/detail/valid_if.cuh
@@ -90,7 +90,7 @@ std::pair<rmm::device_buffer, size_type> valid_if(
   InputIterator begin,
   InputIterator end,
   Predicate p,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   CUDF_EXPECTS(begin <= end, "Invalid range.");
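With the defaulted stream argument removed, every internal caller of `valid_if` must now name a stream. A minimal sketch of a call site under these assumptions (the iterator range `values` and the even-number predicate are invented for illustration):

```cpp
// Build a null mask marking even values as valid; the stream and memory
// resource are now passed explicitly since the defaults were removed.
auto [null_mask, null_count] = cudf::detail::valid_if(
  values.begin(),  // assumed: a device-accessible iterator range of ints
  values.end(),
  [] __device__(int v) { return v % 2 == 0; },  // requires --extended-lambda
  cudf::get_default_stream(),
  rmm::mr::get_current_device_resource());
```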
diff --git a/cpp/include/cudf/dictionary/detail/concatenate.hpp b/cpp/include/cudf/dictionary/detail/concatenate.hpp
index 9f154a054f8..d74429484ce 100644
--- a/cpp/include/cudf/dictionary/detail/concatenate.hpp
+++ b/cpp/include/cudf/dictionary/detail/concatenate.hpp
@@ -37,10 +37,9 @@ namespace detail {
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New column with concatenated results.
  */
-std::unique_ptr<column> concatenate(
-  host_span<column_view const> columns,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> concatenate(host_span<column_view const> columns,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::mr::device_memory_resource* mr);
 
 }  // namespace detail
 }  // namespace dictionary
diff --git a/cpp/include/cudf/dictionary/detail/encode.hpp b/cpp/include/cudf/dictionary/detail/encode.hpp
index 17173564a9a..2aad7dd80ed 100644
--- a/cpp/include/cudf/dictionary/detail/encode.hpp
+++ b/cpp/include/cudf/dictionary/detail/encode.hpp
@@ -51,11 +51,10 @@ namespace detail {
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return Returns a dictionary column.
  */
-std::unique_ptr<column> encode(
-  column_view const& column,
-  data_type indices_type              = data_type{type_id::UINT32},
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> encode(column_view const& column,
+                               data_type indices_type,
+                               rmm::cuda_stream_view stream,
+                               rmm::mr::device_memory_resource* mr);
 
 /**
  * @brief Create a column by gathering the keys from the provided
@@ -72,10 +71,9 @@ std::unique_ptr<column> encode(
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New column with type matching the dictionary_column's keys.
  */
-std::unique_ptr<column> decode(
-  dictionary_column_view const& dictionary_column,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> decode(dictionary_column_view const& dictionary_column,
+                               rmm::cuda_stream_view stream,
+                               rmm::mr::device_memory_resource* mr);
 
 /**
  * @brief Return minimal integer type for the given number of elements.
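The pattern in these dictionary headers repeats across the whole diff: detail-namespace entry points lose their defaulted `stream` (and often `mr`) arguments, so internal callers must spell both out. A hedged sketch of what a call site looks like after this change (the header path for `cudf::get_default_stream()` is an assumption):

```cpp
#include <cudf/dictionary/detail/encode.hpp>
// Assumed location of cudf::get_default_stream(), introduced by this PR's
// rename of cudf::default_stream_value.
#include <cudf/utilities/default_stream.hpp>

// Hypothetical caller: detail APIs now take the stream and memory resource
// explicitly; there is no defaulted argument left to fall back on.
std::unique_ptr<cudf::column> encode_on_default_stream(cudf::column_view const& input)
{
  return cudf::dictionary::detail::encode(input,
                                          cudf::data_type{cudf::type_id::UINT32},
                                          cudf::get_default_stream(),
                                          rmm::mr::get_current_device_resource());
}
```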
diff --git a/cpp/include/cudf/dictionary/detail/replace.hpp b/cpp/include/cudf/dictionary/detail/replace.hpp
index 2b38a6c40ec..0778baa84d6 100644
--- a/cpp/include/cudf/dictionary/detail/replace.hpp
+++ b/cpp/include/cudf/dictionary/detail/replace.hpp
@@ -39,11 +39,10 @@ namespace detail {
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New dictionary column with null rows replaced.
  */
-std::unique_ptr<column> replace_nulls(
-  dictionary_column_view const& input,
-  dictionary_column_view const& replacement,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
+                                      dictionary_column_view const& replacement,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::mr::device_memory_resource* mr);
 
 /**
  * @brief Create a new dictionary column by replacing nulls with a
@@ -57,11 +56,10 @@ std::unique_ptr<column> replace_nulls(
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return New dictionary column with null rows replaced.
  */
-std::unique_ptr<column> replace_nulls(
-  dictionary_column_view const& input,
-  scalar const& replacement,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
+                                      scalar const& replacement,
+                                      rmm::cuda_stream_view stream,
+                                      rmm::mr::device_memory_resource* mr);
 
 }  // namespace detail
 }  // namespace dictionary
diff --git a/cpp/include/cudf/dictionary/detail/search.hpp b/cpp/include/cudf/dictionary/detail/search.hpp
index 4f7939b32a7..62059306b9a 100644
--- a/cpp/include/cudf/dictionary/detail/search.hpp
+++ b/cpp/include/cudf/dictionary/detail/search.hpp
@@ -31,11 +31,10 @@ namespace detail {
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
-std::unique_ptr<scalar> get_index(
-  dictionary_column_view const& dictionary,
-  scalar const& key,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<scalar> get_index(dictionary_column_view const& dictionary,
+                                  scalar const& key,
+                                  rmm::cuda_stream_view stream,
+                                  rmm::mr::device_memory_resource* mr);
 
 /**
  * @brief Get the index for a key if it were added to the given dictionary.
@@ -56,11 +55,10 @@ std::unique_ptr<scalar> get_index(
  * @param mr Device memory resource used to allocate the returned column's device memory.
  * @return Numeric scalar index value of the key within the dictionary
  */
-std::unique_ptr<scalar> get_insert_index(
-  dictionary_column_view const& dictionary,
-  scalar const& key,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<scalar> get_insert_index(dictionary_column_view const& dictionary,
+                                         scalar const& key,
+                                         rmm::cuda_stream_view stream,
+                                         rmm::mr::device_memory_resource* mr);
 
 }  // namespace detail
 }  // namespace dictionary
diff --git a/cpp/include/cudf/dictionary/detail/update_keys.hpp b/cpp/include/cudf/dictionary/detail/update_keys.hpp
index 53fd71e0375..6fd743ad526 100644
--- a/cpp/include/cudf/dictionary/detail/update_keys.hpp
+++ b/cpp/include/cudf/dictionary/detail/update_keys.hpp
@@ -32,11 +32,10 @@ namespace detail {
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
-std::unique_ptr<column> add_keys(
-  dictionary_column_view const& dictionary_column,
-  column_view const& new_keys,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column,
+                                 column_view const& new_keys,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr);
 
 /**
  * @copydoc cudf::dictionary::remove_keys(dictionary_column_view const&,column_view
@@ -44,11 +43,10 @@ std::unique_ptr<column> add_keys(
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
-std::unique_ptr<column> remove_keys(
-  dictionary_column_view const& dictionary_column,
-  column_view const& keys_to_remove,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> remove_keys(dictionary_column_view const& dictionary_column,
+                                    column_view const& keys_to_remove,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::mr::device_memory_resource* mr);
 
 /**
  * @copydoc cudf::dictionary::remove_unused_keys(dictionary_column_view
@@ -56,10 +54,9 @@ std::unique_ptr<column> remove_keys(
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
-std::unique_ptr<column> remove_unused_keys(
-  dictionary_column_view const& dictionary_column,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> remove_unused_keys(dictionary_column_view const& dictionary_column,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::mr::device_memory_resource* mr);
 
 /**
  * @copydoc cudf::dictionary::set_keys(dictionary_column_view
@@ -67,11 +64,10 @@ std::unique_ptr<column> remove_unused_keys(
  *
  * @param stream CUDA stream used for device memory operations and kernel launches.
  */
-std::unique_ptr<column> set_keys(
-  dictionary_column_view const& dictionary_column,
-  column_view const& keys,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column,
+                                 column_view const& keys,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr);
 
 /**
  * @copydoc
@@ -81,8 +77,8 @@ std::unique_ptr<column> set_keys(
  */
 std::vector<std::unique_ptr<column>> match_dictionaries(
   cudf::host_span<table_view const> input,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr);
 
 /**
  * @brief Create new dictionaries that have keys merged from dictionary columns
@@ -105,8 +101,8 @@ std::vector<std::unique_ptr<column>> match_dictionaries(
  */
 std::pair<std::vector<std::unique_ptr<column>>, std::vector<table_view>> match_dictionaries(
   std::vector<table_view> tables,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+  rmm::cuda_stream_view stream,
+  rmm::mr::device_memory_resource* mr);
 
 }  // namespace detail
 }  // namespace dictionary
diff --git a/cpp/include/cudf/dictionary/dictionary_factories.hpp b/cpp/include/cudf/dictionary/dictionary_factories.hpp
index b27fa25a27a..821981ad148 100644
--- a/cpp/include/cudf/dictionary/dictionary_factories.hpp
+++ b/cpp/include/cudf/dictionary/dictionary_factories.hpp
@@ -65,7 +65,7 @@ namespace cudf {
 std::unique_ptr<column> make_dictionary_column(
   column_view const& keys_column,
   column_view const& indices_column,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /**
@@ -117,7 +117,7 @@ std::unique_ptr<column> make_dictionary_column(std::unique_ptr<column> keys_colu
 std::unique_ptr<column> make_dictionary_column(
   std::unique_ptr<column> keys_column,
   std::unique_ptr<column> indices_column,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 /** @} */  // end of group
diff --git
a/cpp/include/cudf/dictionary/encode.hpp b/cpp/include/cudf/dictionary/encode.hpp index 50b81187091..fb13eabe11a 100644 --- a/cpp/include/cudf/dictionary/encode.hpp +++ b/cpp/include/cudf/dictionary/encode.hpp @@ -31,7 +31,7 @@ namespace dictionary { */ /** - * @brief Construct a dictionary column by dictionary encoding an existing column. + * @brief Construct a dictionary column by dictionary encoding an existing column * * The output column is a DICTIONARY type with a keys column of non-null, unique values * that are in a strict, total order. Meaning, `keys[i]` is _ordered before @@ -40,21 +40,21 @@ namespace dictionary { * The output column has a child indices column that is of integer type and with * the same size as the input column. * - * The null_mask and null count are copied from the input column to the output column. + * The null mask and null count are copied from the input column to the output column. * - * @throw cudf::logic_error if indices type is not an unsigned integer type. - * @throw cudf::logic_error if the column to encode is already a DICTIONARY type. + * @throw cudf::logic_error if indices type is not an unsigned integer type + * @throw cudf::logic_error if the column to encode is already a DICTIONARY type * * @code{.pseudo} - * c = [429,111,213,111,213,429,213] - * d = make_dictionary_column(c) - * d now has keys [111,213,429] and indices [2,0,1,0,1,2,1] + * c = [429, 111, 213, 111, 213, 429, 213] + * d = encode(c) + * d now has keys [111, 213, 429] and indices [2, 0, 1, 0, 1, 2, 1] * @endcode * - * @param column The column to dictionary encode. - * @param indices_type The integer type to use for the indices. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return Returns a dictionary column. + * @param column The column to dictionary encode + * @param indices_type The integer type to use for the indices + * @param mr Device memory resource used to allocate the returned column's device memory + * @return Returns a dictionary column */ std::unique_ptr encode( column_view const& column, @@ -66,14 +66,14 @@ std::unique_ptr encode( * dictionary_column into a new column using the indices from that column. * * @code{.pseudo} - * d1 = {["a","c","d"],[2,0,1,0]} + * d1 = {["a", "c", "d"], [2, 0, 1, 0]} * s = decode(d1) - * s is now ["d","a","c","a"] + * s is now ["d", "a", "c", "a"] * @endcode * - * @param dictionary_column Existing dictionary column. - * @param mr Device memory resource used to allocate the returned column's device memory. - * @return New column with type matching the dictionary_column's keys. + * @param dictionary_column Existing dictionary column + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column with type matching the dictionary_column's keys */ std::unique_ptr decode( dictionary_column_view const& dictionary_column, diff --git a/cpp/include/cudf/filling.hpp b/cpp/include/cudf/filling.hpp index 5f9d13f9a2c..8688e97ab7e 100644 --- a/cpp/include/cudf/filling.hpp +++ b/cpp/include/cudf/filling.hpp @@ -103,26 +103,22 @@ std::unique_ptr fill( * ``` * @p count should not have null values; should not contain negative values; * and the sum of count elements should not overflow the size_type's limit. - * It is undefined behavior if @p count has negative values or the sum overflows - * and @p check_count is set to false. + * The behavior of this function is undefined if @p count has negative values + * or the sum overflows. 
 *
 * @throws cudf::logic_error if the data type of @p count is not size_type.
 * @throws cudf::logic_error if @p input_table and @p count have different
 * number of rows.
 * @throws cudf::logic_error if @p count has null values.
- * @throws cudf::logic_error if @p check_count is set to true and @p count
- *         has negative values or the sum of @p count elements overflows.
 *
 * @param input_table Input table
 * @param count Non-nullable column of an integral type
- * @param check_count Whether to check count (negative values and overflow)
 * @param mr Device memory resource used to allocate the returned table's device memory
 * @return The result table containing the repetitions
 */
std::unique_ptr<table> repeat(
  table_view const& input_table,
  column_view const& count,
-  bool check_count                    = false,
  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());

/**
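With `check_count` removed, validating the count column becomes the caller's job. A minimal sketch of the resulting call (the `input` and `counts` arguments are assumed to exist; the row values in the comment are illustrative):

```cpp
#include <cudf/filling.hpp>
#include <cudf/table/table_view.hpp>

// input rows: [A, B, C], counts: [1, 2, 0]  ->  result rows: [A, B, B]
std::unique_ptr<cudf::table> repeat_rows(cudf::table_view const& input,
                                         cudf::column_view const& counts)
{
  // The check_count flag is gone: negative counts, or counts whose sum
  // overflows size_type, are now undefined behavior, so validate upstream.
  return cudf::repeat(input, counts);
}
```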
diff --git a/cpp/include/cudf/interop.hpp b/cpp/include/cudf/interop.hpp
index 016e23688c7..c1c58f136d6 100644
--- a/cpp/include/cudf/interop.hpp
+++ b/cpp/include/cudf/interop.hpp
@@ -16,7 +16,12 @@
 
 #pragma once
 
+// We disable warning 611 because `arrow::TableBatchReader` only partially
+// overrides the `ReadNext` method of `arrow::RecordBatchReader`,
+// triggering warning 611-D from nvcc.
+#pragma nv_diag_suppress 611
 #include
+#pragma nv_diag_default 611
 
 #include
 #include
diff --git a/cpp/include/cudf/io/csv.hpp b/cpp/include/cudf/io/csv.hpp
index f753028a148..1fc4114b94c 100644
--- a/cpp/include/cudf/io/csv.hpp
+++ b/cpp/include/cudf/io/csv.hpp
@@ -1338,8 +1338,8 @@ class csv_writer_options {
   std::string _true_value = std::string{"true"};
   // string to use for values == 0 in INT8 types (default 'false')
   std::string _false_value = std::string{"false"};
-  // Optional associated metadata
-  table_metadata const* _metadata = nullptr;
+  // Names of all columns; if empty, writer will generate column names
+  std::vector<std::string> _names;
 
   /**
    * @brief Constructor from sink and table.
@@ -1387,11 +1387,11 @@ class csv_writer_options {
   [[nodiscard]] table_view const& get_table() const { return _table; }
 
   /**
-   * @brief Returns optional associated metadata.
+   * @brief Returns names of the columns.
    *
-   * @return Optional associated metadata
+   * @return Names of the columns in the output file
    */
-  [[nodiscard]] table_metadata const* get_metadata() const { return _metadata; }
+  [[nodiscard]] std::vector<std::string> const& get_names() const { return _names; }
 
   /**
    * @brief Returns string to used for null entries.
@@ -1444,11 +1444,11 @@ class csv_writer_options {
   // Setter
 
   /**
-   * @brief Sets optional associated metadata.
+   * @brief Sets optional associated column names.
    *
-   @param metadata Associated metadata
+   @param names Associated column names
    */
-  void set_metadata(table_metadata* metadata) { _metadata = metadata; }
+  void set_names(std::vector<std::string> names) { _names = std::move(names); }
 
   /**
    * @brief Sets string to used for null entries.
@@ -1526,14 +1526,14 @@ class csv_writer_options_builder {
   }
 
   /**
-   * @brief Sets optional associated metadata.
+   * @brief Sets optional column names.
    *
-   * @param metadata Associated metadata
+   * @param names Column names
    * @return this for chaining
    */
-  csv_writer_options_builder& metadata(table_metadata* metadata)
+  csv_writer_options_builder& names(std::vector<std::string> names)
   {
-    options._metadata = metadata;
+    options._names = names;
     return *this;
   }
 
diff --git a/cpp/include/cudf/io/datasource.hpp b/cpp/include/cudf/io/datasource.hpp
index 907830de2bb..251a93ac21f 100644
--- a/cpp/include/cudf/io/datasource.hpp
+++ b/cpp/include/cudf/io/datasource.hpp
@@ -22,8 +22,15 @@
 
 #include
 #include
+
+// We disable warning 611 because some Arrow subclasses of
+// `arrow::fs::FileSystem` only partially override the `Equals` method,
+// triggering warning 611-D from nvcc.
+#pragma nv_diag_suppress 611 #include #include +#pragma nv_diag_default 611 + #include #include #include diff --git a/cpp/include/cudf/io/detail/avro.hpp b/cpp/include/cudf/io/detail/avro.hpp index 9551b1f05df..c141e25f939 100644 --- a/cpp/include/cudf/io/detail/avro.hpp +++ b/cpp/include/cudf/io/detail/avro.hpp @@ -39,7 +39,7 @@ namespace avro { table_with_metadata read_avro( std::unique_ptr&& source, avro_reader_options const& options, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace avro diff --git a/cpp/include/cudf/io/detail/csv.hpp b/cpp/include/cudf/io/detail/csv.hpp index 0d79ecd0d77..90d730338fc 100644 --- a/cpp/include/cudf/io/detail/csv.hpp +++ b/cpp/include/cudf/io/detail/csv.hpp @@ -46,16 +46,16 @@ table_with_metadata read_csv(std::unique_ptr&& source, * * @param sink Output sink * @param table The set of columns - * @param metadata The metadata associated with the table + * @param column_names Column names for the output CSV * @param options Settings for controlling behavior * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource to use for device memory allocation */ void write_csv(data_sink* sink, table_view const& table, - const table_metadata* metadata, + host_span column_names, csv_writer_options const& options, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace csv diff --git a/cpp/include/cudf/io/detail/data_casting.cuh b/cpp/include/cudf/io/detail/data_casting.cuh index 628c00ad603..aba9ec07bc6 100644 --- a/cpp/include/cudf/io/detail/data_casting.cuh +++ b/cpp/include/cudf/io/detail/data_casting.cuh @@ -391,10 +391,13 @@ std::unique_ptr parse_data(str_tuple_it str_tuples, return; } + // If this is a string value, remove quotes + auto [in_begin, in_end] = trim_quotes(in.first, in.first + in.second, options.quotechar); + auto const is_parsed = cudf::type_dispatcher(col_type, ConvertFunctor{}, - in.first, - in.first + in.second, + in_begin, + in_end, col.data(), row, col_type, diff --git a/cpp/include/cudf/io/detail/json.hpp b/cpp/include/cudf/io/detail/json.hpp index 3e69ef8a3b8..42717fe36df 100644 --- a/cpp/include/cudf/io/detail/json.hpp +++ b/cpp/include/cudf/io/detail/json.hpp @@ -39,7 +39,7 @@ namespace json { table_with_metadata read_json( std::vector>& sources, json_reader_options const& options, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace json diff --git a/cpp/include/cudf/io/detail/orc.hpp b/cpp/include/cudf/io/detail/orc.hpp index 79fcf4bd916..4c78502a21b 100644 --- a/cpp/include/cudf/io/detail/orc.hpp +++ b/cpp/include/cudf/io/detail/orc.hpp @@ -74,8 +74,7 @@ class reader { * * @return The set of columns along with table metadata */ - table_with_metadata read(orc_reader_options const& options, - rmm::cuda_stream_view stream = cudf::default_stream_value); + table_with_metadata read(orc_reader_options const& options, rmm::cuda_stream_view stream); }; /** diff --git a/cpp/include/cudf/io/detail/parquet.hpp b/cpp/include/cudf/io/detail/parquet.hpp index 7675dc70cb2..7f107017864 100644 --- a/cpp/include/cudf/io/detail/parquet.hpp +++ b/cpp/include/cudf/io/detail/parquet.hpp @@ -30,25 +30,28 @@ 
 #include
 #include
 
-namespace cudf {
-namespace io {
+namespace cudf::io {
 
 // Forward declaration
 class parquet_reader_options;
 class parquet_writer_options;
 class chunked_parquet_writer_options;
 
-namespace detail {
-namespace parquet {
+namespace detail::parquet {
 
 /**
  * @brief Class to read Parquet dataset data into columns.
 */
 class reader {
- private:
+ protected:
   class impl;
   std::unique_ptr<impl> _impl;
 
+  /**
+   * @brief Default constructor, needed for subclassing.
+   */
+  reader();
+
  public:
   /**
    * @brief Constructor from an array of datasources
@@ -66,7 +69,7 @@ class reader {
   /**
    * @brief Destructor explicitly declared to avoid inlining in the header
    */
-  ~reader();
+  virtual ~reader();
 
   /**
    * @brief Reads the dataset as per given options.
@@ -78,6 +81,62 @@ class reader {
   table_with_metadata read(parquet_reader_options const& options);
 };
 
+/**
+ * @brief The reader class that supports iterative reading of a given file.
+ *
+ * This class intentionally subclasses the `reader` class with private inheritance to hide the
+ * `reader::read()` API. As such, only chunked reading APIs are supported.
+ */
+class chunked_reader : private reader {
+ public:
+  /**
+   * @brief Constructor from a read size limit and an array of data sources with reader options.
+   *
+   * The typical usage should be similar to this:
+   * ```
+   * do {
+   *   auto const chunk = reader.read_chunk();
+   *   // Process chunk
+   * } while (reader.has_next());
+   *
+   * ```
+   *
+   * If `chunk_read_limit == 0` (i.e., no reading limit), a call to `read_chunk()` will read the
+   * whole file and return a table containing all rows.
+   *
+   * @param chunk_read_limit Limit on total number of bytes to be returned per read,
+   *        or `0` if there is no limit
+   * @param sources Input `datasource` objects to read the dataset from
+   * @param options Settings for controlling reading behavior
+   * @param stream CUDA stream used for device memory operations and kernel launches.
+   * @param mr Device memory resource to use for device memory allocation
+   */
+  explicit chunked_reader(std::size_t chunk_read_limit,
+                          std::vector<std::unique_ptr<datasource>>&& sources,
+                          parquet_reader_options const& options,
+                          rmm::cuda_stream_view stream,
+                          rmm::mr::device_memory_resource* mr);
+
+  /**
+   * @brief Destructor explicitly declared to avoid inlining in the header.
+   *
+   * Since the declaration of the internal `_impl` object does not exist in this header, this
+   * destructor needs to be defined in a separate source file which can access that object's
+   * declaration.
+   */
+  ~chunked_reader();
+
+  /**
+   * @copydoc cudf::io::chunked_parquet_reader::has_next
+   */
+  [[nodiscard]] bool has_next() const;
+
+  /**
+   * @copydoc cudf::io::chunked_parquet_reader::read_chunk
+   */
+  [[nodiscard]] table_with_metadata read_chunk() const;
+};
+
 /**
  * @brief Class to write parquet dataset data into columns.
 */
@@ -154,7 +213,5 @@ class writer {
     const std::vector<std::unique_ptr<std::vector<uint8_t>>>& metadata_list);
 };
 
-};  // namespace parquet
-};  // namespace detail
-};  // namespace io
-};  // namespace cudf
+}  // namespace detail::parquet
+}  // namespace cudf::io
diff --git a/cpp/include/cudf/io/orc.hpp b/cpp/include/cudf/io/orc.hpp
index 7f3cb95e4b2..b1e2197a868 100644
--- a/cpp/include/cudf/io/orc.hpp
+++ b/cpp/include/cudf/io/orc.hpp
@@ -378,9 +378,6 @@ class orc_reader_options_builder {
 * auto result = cudf::io::read_orc(options);
 * @endcode
 *
- * Note: Support for reading files with struct columns is currently experimental, the output may not
- * be as reliable as reading for other datatypes.
- * * @param options Settings for controlling reading behavior * @param mr Device memory resource used to allocate device memory of the table in the returned * table_with_metadata. @@ -783,9 +780,6 @@ class orc_writer_options_builder { * cudf::io::write_orc(options); * @endcode * - * Note: Support for writing tables with struct columns is currently experimental, the output may - * not be as reliable as writing for other datatypes. - * * @param options Settings for controlling reading behavior * @param mr Device memory resource to use for device memory allocation */ diff --git a/cpp/include/cudf/io/orc_metadata.hpp b/cpp/include/cudf/io/orc_metadata.hpp index d974eaa103a..6ef7ea49c59 100644 --- a/cpp/include/cudf/io/orc_metadata.hpp +++ b/cpp/include/cudf/io/orc_metadata.hpp @@ -21,6 +21,7 @@ #pragma once +#include #include #include @@ -180,7 +181,7 @@ struct column_statistics { * * @param detail_statistics The statistics to initialize the object with */ - column_statistics(cudf::io::orc::column_statistics&& detail_statistics); + column_statistics(orc::column_statistics&& detail_statistics); }; /** @@ -207,5 +208,166 @@ struct parsed_orc_statistics { */ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info); +/** + * @brief Schema of an ORC column, including the nested columns. + */ +struct orc_column_schema { + public: + /** + * @brief constructor + * + * @param name column name + * @param type ORC type + * @param children child columns (empty for non-nested types) + */ + orc_column_schema(std::string_view name, + orc::TypeKind type, + std::vector children) + : _name{name}, _type_kind{type}, _children{std::move(children)} + { + } + + /** + * @brief Returns ORC column name; can be empty. + * + * @return Column name + */ + [[nodiscard]] auto name() const { return _name; } + + /** + * @brief Returns ORC type of the column. + * + * @return Column ORC type + */ + [[nodiscard]] auto type_kind() const { return _type_kind; } + + /** + * @brief Returns schemas of all child columns. + * + * @return Children schemas + */ + [[nodiscard]] auto const& children() const& { return _children; } + + /** @copydoc children + * Children array is moved out of the object (rvalues only). + * + */ + [[nodiscard]] auto children() && { return std::move(_children); } + + /** + * @brief Returns schema of the child with the given index. + * + * @param idx child index + * + * @return Child schema + */ + [[nodiscard]] auto const& child(int idx) const& { return children().at(idx); } + + /** @copydoc child + * Child is moved out of the object (rvalues only). + * + */ + [[nodiscard]] auto child(int idx) && { return std::move(children().at(idx)); } + + /** + * @brief Returns the number of child columns. + * + * @return Children count + */ + [[nodiscard]] auto num_children() const { return children().size(); } + + private: + std::string _name; + orc::TypeKind _type_kind; + std::vector _children; +}; + +/** + * @brief Schema of an ORC file. + */ +struct orc_schema { + public: + /** + * @brief constructor + * + * @param root_column_schema root column + */ + orc_schema(orc_column_schema root_column_schema) : _root{std::move(root_column_schema)} {} + + /** + * @brief Returns the schema of the struct column that contains all columns as fields. + * + * @return Root column schema + */ + [[nodiscard]] auto const& root() const& { return _root; } + + /** @copydoc root + * Root column schema is moved out of the object (rvalues only). 
+ *
+ */
+  [[nodiscard]] auto root() && { return std::move(_root); }
+
+ private:
+  orc_column_schema _root;
+};
+
+/**
+ * @brief Information about content of an ORC file.
+ */
+class orc_metadata {
+ public:
+  /**
+   * @brief constructor
+   *
+   * @param schema ORC schema
+   * @param num_rows number of rows
+   * @param num_stripes number of stripes
+   */
+  orc_metadata(orc_schema schema, size_type num_rows, size_type num_stripes)
+    : _schema{std::move(schema)}, _num_rows{num_rows}, _num_stripes{num_stripes}
+  {
+  }
+
+  /**
+   * @brief Returns the ORC schema.
+   *
+   * @return ORC schema
+   */
+  [[nodiscard]] auto const& schema() const { return _schema; }
+
+  /**
+   * @brief Returns the number of rows of the root column.
+   *
+   * If a file contains list columns, nested columns can have a different number of rows.
+   *
+   * @return Number of rows
+   */
+  [[nodiscard]] auto num_rows() const { return _num_rows; }
+
+  /**
+   * @brief Returns the number of stripes in the file.
+   *
+   * @return Number of stripes
+   */
+  [[nodiscard]] auto num_stripes() const { return _num_stripes; }
+
+ private:
+  orc_schema _schema;
+  size_type _num_rows;
+  size_type _num_stripes;
+};
+
+/**
+ * @brief Reads the schema and basic metadata (row and stripe counts) of an ORC dataset.
+ *
+ * @ingroup io_readers
+ *
+ * @param src_info Dataset source
+ *
+ * @return The ORC schema together with the row and stripe counts
+ */
+orc_metadata read_orc_metadata(source_info const& src_info);
+
 }  // namespace io
 }  // namespace cudf
diff --git a/cpp/src/io/orc/orc_common.hpp b/cpp/include/cudf/io/orc_types.hpp
similarity index 94%
rename from cpp/src/io/orc/orc_common.hpp
rename to cpp/include/cudf/io/orc_types.hpp
index c2898b362a6..09cae2ef06c 100644
--- a/cpp/src/io/orc/orc_common.hpp
+++ b/cpp/include/cudf/io/orc_types.hpp
@@ -18,11 +18,7 @@
 
 #include
 
-namespace cudf {
-namespace io {
-namespace orc {
-
-static constexpr uint32_t block_header_size = 3;
+namespace cudf::io::orc {
 
 enum CompressionKind : uint8_t {
   NONE = 0,
@@ -87,6 +83,4 @@ enum ProtofType : uint8_t {
   INVALID_7 = 7,
 };
 
-}  // namespace orc
-}  // namespace io
-}  // namespace cudf
+}  // namespace cudf::io::orc
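The new `orc_metadata` / `orc_schema` / `orc_column_schema` types form a tree rooted at a struct column whose fields are the file's top-level columns. A minimal traversal sketch under those assumptions (the file path is illustrative):

```cpp
#include <cudf/io/orc_metadata.hpp>
#include <iostream>
#include <string>

// Hypothetical walk over the schema tree returned by read_orc_metadata().
void print_schema(cudf::io::orc_column_schema const& col, int depth = 0)
{
  std::cout << std::string(depth * 2, ' ') << col.name() << '\n';
  for (int i = 0; i < static_cast<int>(col.num_children()); ++i) {
    print_schema(col.child(i), depth + 1);
  }
}

void example()
{
  auto const metadata =
    cudf::io::read_orc_metadata(cudf::io::source_info{"example.orc"});
  std::cout << metadata.num_rows() << " rows, " << metadata.num_stripes()
            << " stripes\n";
  print_schema(metadata.schema().root());
}
```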
diff --git a/cpp/include/cudf/io/parquet.hpp b/cpp/include/cudf/io/parquet.hpp
index ff5b9f5c457..f3facae098d 100644
--- a/cpp/include/cudf/io/parquet.hpp
+++ b/cpp/include/cudf/io/parquet.hpp
@@ -30,8 +30,7 @@
 #include
 #include
 
-namespace cudf {
-namespace io {
+namespace cudf::io {
 /**
 * @addtogroup io_readers
 * @{
@@ -400,6 +399,74 @@ table_with_metadata read_parquet(
   parquet_reader_options const& options,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief The chunked parquet reader class to read a Parquet file iteratively into a series of
+ * tables, chunk by chunk.
+ *
+ * This class addresses reading Parquet files so large that their columns would exceed the size
+ * limit of a single cudf column. When the file content is read in chunks through this class,
+ * each chunk is guaranteed to stay within the given size limit.
+ */
+class chunked_parquet_reader {
+ public:
+  /**
+   * @brief Default constructor, this should never be used.
+   *
+   * This is added just to satisfy cython.
+   */
+  chunked_parquet_reader() = default;
+
+  /**
+   * @brief Constructor for chunked reader.
+   *
+   * This constructor requires the same `parquet_reader_options` parameter as in
+   * `cudf::read_parquet()`, and an additional parameter to specify the byte size limit of the
+   * output table for each read.
+   *
+   * @param chunk_read_limit Limit on total number of bytes to be returned per read,
+   *        or `0` if there is no limit
+   * @param options The options used to read Parquet file
+   * @param mr Device memory resource to use for device memory allocation
+   */
+  chunked_parquet_reader(
+    std::size_t chunk_read_limit,
+    parquet_reader_options const& options,
+    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
+  /**
+   * @brief Destructor, destroying the internal reader instance.
+   *
+   * Since the declaration of the internal `reader` object does not exist in this header, this
+   * destructor needs to be defined in a separate source file which can access that object's
+   * declaration.
+   */
+  ~chunked_parquet_reader();
+
+  /**
+   * @brief Check if there is any data in the given file that has not yet been read.
+   *
+   * @return A boolean value indicating if there is any data left to read
+   */
+  [[nodiscard]] bool has_next() const;
+
+  /**
+   * @brief Read a chunk of rows in the given Parquet file.
+   *
+   * The sequence of returned tables, if concatenated in order, is guaranteed to form the same
+   * dataset as reading the entire given file at once.
+   *
+   * An empty table will be returned if the given file is empty, or all the data in the file has
+   * been read and returned by the previous calls.
+   *
+   * @return An output `cudf::table` along with its metadata
+   */
+  [[nodiscard]] table_with_metadata read_chunk() const;
+
+ private:
+  std::unique_ptr<cudf::io::detail::parquet::chunked_reader> reader;
+};
+
 /** @} */  // end of group
 /**
 * @addtogroup io_writers
@@ -1452,5 +1519,5 @@ class parquet_chunked_writer {
 };
 
 /** @} */  // end of group
-}  // namespace io
-}  // namespace cudf
+
+}  // namespace cudf::io
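A hedged sketch of the intended driver loop for the new public class; the file name and the 500 MB limit are illustrative only:

```cpp
#include <cudf/io/parquet.hpp>

void read_in_chunks()
{
  auto const options = cudf::io::parquet_reader_options::builder(
                         cudf::io::source_info{"large_file.parquet"})
                         .build();
  // Cap each returned table at roughly 500 MB of output.
  auto reader = cudf::io::chunked_parquet_reader(500'000'000, options);
  do {
    auto chunk = reader.read_chunk();  // each chunk stays under the byte limit
    // ... process chunk.tbl (a cudf::table) and chunk.metadata ...
  } while (reader.has_next());
}
```

Note the do/while shape mirrors the usage documented on `detail::parquet::chunked_reader` above: `read_chunk()` is called at least once, and `has_next()` decides whether another iteration is needed.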
diff --git a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
index 6f94fb170a8..f5230863f17 100644
--- a/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
+++ b/cpp/include/cudf/io/text/data_chunk_source_factories.hpp
@@ -16,6 +16,7 @@
 
 #pragma once
 
+#include
 #include
 #include
 #include
@@ -25,6 +26,14 @@
 
 namespace cudf::io::text {
 
+/**
+ * @brief Creates a data source capable of producing device-buffered views of a datasource.
+ * @param data the datasource to be exposed as a data chunk source
+ * @return the data chunk source for the provided datasource. It must not outlive the datasource
+ *         used to construct it.
+ */
+std::unique_ptr<data_chunk_source> make_source(datasource& data);
+
 /**
  * @brief Creates a data source capable of producing device-buffered views of the given string.
  * @param data the host data to be exposed as a data chunk source. Its lifetime must be at least as
diff --git a/cpp/include/cudf/io/text/detail/bgzip_utils.hpp b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp
new file mode 100644
index 00000000000..627df5f358a
--- /dev/null
+++ b/cpp/include/cudf/io/text/detail/bgzip_utils.hpp
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include
+
+#include
+#include
+
+#include
+#include
+#include
+#include
+
+namespace cudf::io::text::detail::bgzip {
+
+struct header {
+  int block_size;
+  int extra_length;
+  [[nodiscard]] int data_size() const { return block_size - extra_length - 20; }
+};
+
+struct footer {
+  uint32_t crc;
+  uint32_t decompressed_size;
+};
+
+/**
+ * @brief Reads the full BGZIP header from the given input stream. Afterwards, the stream position
+ * is at the first data byte.
+ *
+ * @param input_stream The input stream
+ * @return The header storing the compressed size and extra subfield length
+ */
+header read_header(std::istream& input_stream);
+
+/**
+ * @brief Reads the full BGZIP footer from the given input stream. Afterwards, the stream position
+ * is after the last footer byte.
+ *
+ * @param input_stream The input stream
+ * @return The footer storing uncompressed size and CRC32
+ */
+footer read_footer(std::istream& input_stream);
+
+/**
+ * @brief Writes a header for data of the given compressed size to the given stream.
+ *
+ * @param output_stream The output stream
+ * @param compressed_size The size of the compressed data
+ * @param pre_size_subfields Any GZIP extra subfields (need to be valid) to be placed before the
+ *                           BGZIP block size subfield
+ * @param post_size_subfields Any subfields to be placed after the BGZIP block size subfield
+ */
+void write_header(std::ostream& output_stream,
+                  uint16_t compressed_size,
+                  host_span<char const> pre_size_subfields,
+                  host_span<char const> post_size_subfields);
+
+/**
+ * @brief Writes a footer for the given uncompressed data to the given stream.
+ *
+ * @param output_stream The output stream
+ * @param data The data for which uncompressed size and CRC32 will be computed and written
+ */
+void write_footer(std::ostream& output_stream, host_span<char const> data);
+
+/**
+ * @brief Writes the given data to the given stream as an uncompressed deflate block with BGZIP
+ * header and footer.
+ *
+ * @param output_stream The output stream
+ * @param data The uncompressed data
+ * @param pre_size_subfields Any GZIP extra subfields (need to be valid) to be placed before the
+ *                           BGZIP block size subfield
+ * @param post_size_subfields Any subfields to be placed after the BGZIP block size subfield
+ */
+void write_uncompressed_block(std::ostream& output_stream,
+                              host_span<char const> data,
+                              host_span<char const> pre_size_subfields  = {},
+                              host_span<char const> post_size_subfields = {});
+
+/**
+ * @brief Writes the given data to the given stream as a compressed deflate block with BGZIP
+ * header and footer.
+ *
+ * @param output_stream The output stream
+ * @param data The uncompressed data
+ * @param pre_size_subfields Any GZIP extra subfields (need to be valid) to be placed before the
+ *                           BGZIP block size subfield
+ * @param post_size_subfields Any subfields to be placed after the BGZIP block size subfield
+ */
+void write_compressed_block(std::ostream& output_stream,
+                            host_span<char const> data,
+                            host_span<char const> pre_size_subfields  = {},
+                            host_span<char const> post_size_subfields = {});
+
+}  // namespace cudf::io::text::detail::bgzip
diff --git a/cpp/include/cudf/io/text/multibyte_split.hpp b/cpp/include/cudf/io/text/multibyte_split.hpp
index abb966a55bf..a7edc9be0e4 100644
--- a/cpp/include/cudf/io/text/multibyte_split.hpp
+++ b/cpp/include/cudf/io/text/multibyte_split.hpp
@@ -30,11 +30,25 @@
 namespace cudf {
 namespace io {
 namespace text {
 
+/**
+ * @brief Parsing options for multibyte_split.
+ */
+struct parse_options {
+  /**
+   * @brief Only rows starting inside this byte range will be part of the output column.
+   */
+  byte_range_info byte_range = create_byte_range_info_max();
+  /**
+   * @brief Whether delimiters at the end of rows should be stripped from the output column
+   */
+  bool strip_delimiters = false;
+};
+
 /**
 * @brief Splits the source text into a strings column using a multiple byte delimiter.
 *
- * Providing a byte range allows multibyte_split to read a whole file, but only return the offsets
- * of delimiters which begin within the range. If thinking in terms of "records", where each
+ * Providing a byte range allows multibyte_split to read a file partially, only returning the
+ * offsets of delimiters which begin within the range. If thinking in terms of "records", where each
 * delimiter dictates the end of a record, all records which begin within the byte range provided
 * will be returned, including any record which may begin in the range but end outside of the
 * range. Records which begin outside of the range will be ignored, even if those records end
 * inside the range.
@@ -63,7 +77,7 @@ namespace text {
 *
 * @param source The source string
 * @param delimiter UTF-8 encoded string for which to find offsets in the source
- * @param byte_range range in which to consider offsets relevant
+ * @param options the parsing options to use (including byte range)
 * @param mr Memory resource to use for the device memory allocation
 * @return The strings found by splitting the source by the delimiter within the relevant byte
 * range.
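The new `parse_options` bundle replaces the old `std::optional<byte_range_info>` argument and adds delimiter stripping. A hedged usage sketch, combining it with the `make_source(datasource&)` factory added above (the file path is illustrative, and `cudf::io::datasource::create` is assumed to be the existing datasource factory):

```cpp
#include <cudf/io/datasource.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/multibyte_split.hpp>
#include <memory>
#include <string>

// Hypothetical: split a text file into one row per line.
std::unique_ptr<cudf::column> read_lines(std::string const& path)
{
  auto data_source  = cudf::io::datasource::create(path);
  auto chunk_source = cudf::io::text::make_source(*data_source);  // new in this diff

  cudf::io::text::parse_options options;
  options.strip_delimiters = true;  // drop the trailing "\n" from each row
  return cudf::io::text::multibyte_split(*chunk_source, "\n", options);
}
```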
@@ -71,8 +85,14 @@ namespace text { std::unique_ptr multibyte_split( data_chunk_source const& source, std::string const& delimiter, - std::optional byte_range = std::nullopt, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + parse_options options = {}, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + +std::unique_ptr multibyte_split( + data_chunk_source const& source, + std::string const& delimiter, + std::optional byte_range, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr multibyte_split(data_chunk_source const& source, std::string const& delimiter, diff --git a/cpp/include/cudf/io/types.hpp b/cpp/include/cudf/io/types.hpp index 838151fbaf9..9c47ed9ea69 100644 --- a/cpp/include/cudf/io/types.hpp +++ b/cpp/include/cudf/io/types.hpp @@ -419,7 +419,7 @@ class column_in_metadata { * @param name Name of the column * @return this for chaining */ - column_in_metadata& set_name(std::string const& name) + column_in_metadata& set_name(std::string const& name) noexcept { _name = name; return *this; @@ -433,7 +433,7 @@ class column_in_metadata { * @param nullable Whether this column is nullable * @return this for chaining */ - column_in_metadata& set_nullability(bool nullable) + column_in_metadata& set_nullability(bool nullable) noexcept { _nullable = nullable; return *this; @@ -446,7 +446,7 @@ class column_in_metadata { * * @return this for chaining */ - column_in_metadata& set_list_column_as_map() + column_in_metadata& set_list_column_as_map() noexcept { _list_column_is_map = true; return *this; @@ -460,7 +460,7 @@ class column_in_metadata { * @param req True = use int96 physical type. False = use int64 physical type * @return this for chaining */ - column_in_metadata& set_int96_timestamps(bool req) + column_in_metadata& set_int96_timestamps(bool req) noexcept { _use_int96_timestamp = req; return *this; @@ -473,7 +473,7 @@ class column_in_metadata { * @param precision The integer precision to set for this decimal column * @return this for chaining */ - column_in_metadata& set_decimal_precision(uint8_t precision) + column_in_metadata& set_decimal_precision(uint8_t precision) noexcept { _decimal_precision = precision; return *this; @@ -485,7 +485,7 @@ class column_in_metadata { * @param field_id The parquet field id to set * @return this for chaining */ - column_in_metadata& set_parquet_field_id(int32_t field_id) + column_in_metadata& set_parquet_field_id(int32_t field_id) noexcept { _parquet_field_id = field_id; return *this; @@ -499,7 +499,7 @@ class column_in_metadata { * @param binary True = use binary data type. 
False = use string data type * @return this for chaining */ - column_in_metadata& set_output_as_binary(bool binary) + column_in_metadata& set_output_as_binary(bool binary) noexcept { _output_as_binary = binary; return *this; @@ -511,7 +511,7 @@ class column_in_metadata { * @param i Index of the child to get * @return this for chaining */ - column_in_metadata& child(size_type i) { return children[i]; } + column_in_metadata& child(size_type i) noexcept { return children[i]; } /** * @brief Get const reference to a child of this column @@ -519,21 +519,21 @@ class column_in_metadata { * @param i Index of the child to get * @return this for chaining */ - [[nodiscard]] column_in_metadata const& child(size_type i) const { return children[i]; } + [[nodiscard]] column_in_metadata const& child(size_type i) const noexcept { return children[i]; } /** * @brief Get the name of this column * * @return The name of this column */ - [[nodiscard]] std::string get_name() const { return _name; } + [[nodiscard]] std::string get_name() const noexcept { return _name; } /** * @brief Get whether nullability has been explicitly set for this column. * * @return Boolean indicating whether nullability has been explicitly set for this column */ - [[nodiscard]] bool is_nullability_defined() const { return _nullable.has_value(); } + [[nodiscard]] bool is_nullability_defined() const noexcept { return _nullable.has_value(); } /** * @brief Gets the explicitly set nullability for this column. @@ -549,7 +549,7 @@ class column_in_metadata { * * @return Boolean indicating whether this column is to be encoded as a map */ - [[nodiscard]] bool is_map() const { return _list_column_is_map; } + [[nodiscard]] bool is_map() const noexcept { return _list_column_is_map; } /** * @brief Get whether to encode this timestamp column using deprecated int96 physical type @@ -557,14 +557,17 @@ class column_in_metadata { * @return Boolean indicating whether to encode this timestamp column using deprecated int96 * physical type */ - [[nodiscard]] bool is_enabled_int96_timestamps() const { return _use_int96_timestamp; } + [[nodiscard]] bool is_enabled_int96_timestamps() const noexcept { return _use_int96_timestamp; } /** * @brief Get whether precision has been set for this decimal column * * @return Boolean indicating whether precision has been set for this decimal column */ - [[nodiscard]] bool is_decimal_precision_set() const { return _decimal_precision.has_value(); } + [[nodiscard]] bool is_decimal_precision_set() const noexcept + { + return _decimal_precision.has_value(); + } /** * @brief Get the decimal precision that was set for this column. @@ -580,7 +583,10 @@ class column_in_metadata { * * @return Boolean indicating whether parquet field id has been set for this column */ - [[nodiscard]] bool is_parquet_field_id_set() const { return _parquet_field_id.has_value(); } + [[nodiscard]] bool is_parquet_field_id_set() const noexcept + { + return _parquet_field_id.has_value(); + } /** * @brief Get the parquet field id that was set for this column. 
@@ -596,14 +602,14 @@ class column_in_metadata { * * @return The number of children of this column */ - [[nodiscard]] size_type num_children() const { return children.size(); } + [[nodiscard]] size_type num_children() const noexcept { return children.size(); } /** * @brief Get whether to encode this column as binary or string data * * @return Boolean indicating whether to encode this column as binary data */ - [[nodiscard]] bool is_enabled_output_as_binary() const { return _output_as_binary; } + [[nodiscard]] bool is_enabled_output_as_binary() const noexcept { return _output_as_binary; } }; /** diff --git a/cpp/include/cudf/join.hpp b/cpp/include/cudf/join.hpp index bc3bfef3a7d..b613a661d95 100644 --- a/cpp/include/cudf/join.hpp +++ b/cpp/include/cudf/join.hpp @@ -287,7 +287,7 @@ class hash_join { */ hash_join(cudf::table_view const& build, null_equality compare_nulls, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * Returns the row indices that can be used to construct the result of performing @@ -308,7 +308,7 @@ class hash_join { std::unique_ptr>> inner_join(cudf::table_view const& probe, std::optional output_size = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; /** @@ -330,7 +330,7 @@ class hash_join { std::unique_ptr>> left_join(cudf::table_view const& probe, std::optional output_size = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; /** @@ -352,7 +352,7 @@ class hash_join { std::unique_ptr>> full_join(cudf::table_view const& probe, std::optional output_size = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const; /** @@ -366,7 +366,7 @@ class hash_join { * `build` and `probe` as the the join keys . */ [[nodiscard]] std::size_t inner_join_size( - cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::default_stream_value) const; + cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * Returns the exact number of matches (rows) when performing a left join with the specified probe @@ -379,7 +379,7 @@ class hash_join { * and `probe` as the the join keys . 
 */
  [[nodiscard]] std::size_t left_join_size(
-    cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::default_stream_value) const;
+    cudf::table_view const& probe, rmm::cuda_stream_view stream = cudf::get_default_stream()) const;
 
  /**
   * Returns the exact number of matches (rows) when performing a full join with the specified probe
@@ -395,7 +395,7 @@
   */
  std::size_t full_join_size(
    cudf::table_view const& probe,
-    rmm::cuda_stream_view stream        = cudf::default_stream_value,
+    rmm::cuda_stream_view stream        = cudf::get_default_stream(),
    rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) const;
 
 private:
diff --git a/cpp/include/cudf/lists/detail/concatenate.hpp b/cpp/include/cudf/lists/detail/concatenate.hpp
index e2e17579c85..5a8b4bc3bf3 100644
--- a/cpp/include/cudf/lists/detail/concatenate.hpp
+++ b/cpp/include/cudf/lists/detail/concatenate.hpp
@@ -45,7 +45,7 @@ namespace detail {
 */
 std::unique_ptr<column> concatenate(
   host_span<column_view const> columns,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
 }  // namespace detail
diff --git a/cpp/include/cudf/lists/detail/gather.cuh b/cpp/include/cudf/lists/detail/gather.cuh
index 9cbe9582456..48c0ed8f6e9 100644
--- a/cpp/include/cudf/lists/detail/gather.cuh
+++ b/cpp/include/cudf/lists/detail/gather.cuh
@@ -89,7 +89,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column,
   // generate the compacted outgoing offsets.
   auto count_iter = thrust::make_counting_iterator(0);
   thrust::transform_exclusive_scan(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     count_iter,
     count_iter + offset_count,
     dst_offsets_v.begin(),
@@ -125,7 +125,7 @@ gather_data make_gather_data(cudf::lists_column_view const& source_column,
   // generate the base offsets
   rmm::device_uvector base_offsets = rmm::device_uvector(output_count, stream);
   thrust::transform(
-    rmm::exec_policy(stream),
+    rmm::exec_policy_nosync(stream),
     gather_map,
     gather_map + output_count,
     base_offsets.data(),
@@ -320,8 +320,8 @@ std::unique_ptr<column> gather_list_leaf(
 std::unique_ptr<column> segmented_gather(
   lists_column_view const& source_column,
   lists_column_view const& gather_map_list,
-  out_of_bounds_policy bounds_policy = out_of_bounds_policy::DONT_CHECK,
-  rmm::cuda_stream_view stream       = cudf::default_stream_value,
+  out_of_bounds_policy bounds_policy  = out_of_bounds_policy::DONT_CHECK,
+ rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/lists/detail/scatter.cuh b/cpp/include/cudf/lists/detail/scatter.cuh index 2e60df4a5ae..f4106fb5cdf 100644 --- a/cpp/include/cudf/lists/detail/scatter.cuh +++ b/cpp/include/cudf/lists/detail/scatter.cuh @@ -58,7 +58,7 @@ rmm::device_uvector list_vector_from_column( auto vector = rmm::device_uvector(n_rows, stream, mr); - thrust::transform(rmm::exec_policy(stream), + thrust::transform(rmm::exec_policy_nosync(stream), index_begin, index_end, vector.begin(), @@ -96,7 +96,7 @@ std::unique_ptr scatter_impl( MapIterator scatter_map_end, column_view const& source, column_view const& target, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { CUDF_EXPECTS(column_types_equal(source, target), "Mismatched column types."); @@ -104,7 +104,7 @@ std::unique_ptr scatter_impl( auto const child_column_type = lists_column_view(target).child().type(); // Scatter. - thrust::scatter(rmm::exec_policy(stream), + thrust::scatter(rmm::exec_policy_nosync(stream), source_vector.begin(), source_vector.end(), scatter_map_begin, @@ -169,7 +169,7 @@ std::unique_ptr scatter( MapIterator scatter_map_begin, MapIterator scatter_map_end, column_view const& target, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto const num_rows = target.size(); @@ -226,7 +226,7 @@ std::unique_ptr scatter( MapIterator scatter_map_begin, MapIterator scatter_map_end, column_view const& target, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { auto const num_rows = target.size(); @@ -239,7 +239,7 @@ std::unique_ptr scatter( : cudf::detail::create_null_mask(1, mask_state::ALL_NULL, stream, mr); auto offset_column = make_numeric_column( data_type{type_to_id()}, 2, mask_state::UNALLOCATED, stream, mr); - thrust::sequence(rmm::exec_policy(stream), + thrust::sequence(rmm::exec_policy_nosync(stream), offset_column->mutable_view().begin(), offset_column->mutable_view().end(), 0, diff --git a/cpp/include/cudf/lists/lists_column_factories.hpp b/cpp/include/cudf/lists/lists_column_factories.hpp index 2b40a875cc9..a6eacb97e91 100644 --- a/cpp/include/cudf/lists/lists_column_factories.hpp +++ b/cpp/include/cudf/lists/lists_column_factories.hpp @@ -38,7 +38,7 @@ namespace detail { std::unique_ptr make_lists_column_from_scalar( list_scalar const& value, size_type size, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/include/cudf/partitioning.hpp b/cpp/include/cudf/partitioning.hpp index 6e9f571cc9d..2c91bdf64f5 100644 --- a/cpp/include/cudf/partitioning.hpp +++ b/cpp/include/cudf/partitioning.hpp @@ -98,7 +98,7 @@ std::pair, std::vector> hash_partition( int num_partitions, hash_id hash_function = hash_id::HASH_MURMUR3, uint32_t seed = DEFAULT_HASH_SEED, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = 
rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/quantiles.hpp b/cpp/include/cudf/quantiles.hpp index 531c7e3477d..1f3c26fa077 100644 --- a/cpp/include/cudf/quantiles.hpp +++ b/cpp/include/cudf/quantiles.hpp @@ -18,7 +18,7 @@ #include #include -#include +#include #include #include diff --git a/cpp/include/cudf/reduction.hpp b/cpp/include/cudf/reduction.hpp index 083892aa856..7aa7ada6896 100644 --- a/cpp/include/cudf/reduction.hpp +++ b/cpp/include/cudf/reduction.hpp @@ -72,7 +72,7 @@ enum class scan_type : bool { INCLUSIVE, EXCLUSIVE }; */ std::unique_ptr reduce( column_view const& col, - std::unique_ptr const& agg, + reduce_aggregation const& agg, data_type output_dtype, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -89,7 +89,7 @@ std::unique_ptr reduce( */ std::unique_ptr reduce( column_view const& col, - std::unique_ptr const& agg, + reduce_aggregation const& agg, data_type output_dtype, std::optional> init, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); @@ -187,7 +187,7 @@ std::unique_ptr segmented_reduce( */ std::unique_ptr scan( const column_view& input, - std::unique_ptr const& agg, + scan_aggregation const& agg, scan_type inclusive, null_policy null_handling = null_policy::EXCLUDE, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/include/cudf/scalar/scalar.hpp b/cpp/include/cudf/scalar/scalar.hpp index 9b9c73071af..6161639a6fb 100644 --- a/cpp/include/cudf/scalar/scalar.hpp +++ b/cpp/include/cudf/scalar/scalar.hpp @@ -64,7 +64,7 @@ class scalar { * @param is_valid true: set the value to valid. false: set it to null. * @param stream CUDA stream used for device memory operations. */ - void set_valid_async(bool is_valid, rmm::cuda_stream_view stream = cudf::default_stream_value); + void set_valid_async(bool is_valid, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Indicates whether the scalar contains a valid value. @@ -76,7 +76,7 @@ class scalar { * @return true Value is valid * @return false Value is invalid/null */ - [[nodiscard]] bool is_valid(rmm::cuda_stream_view stream = cudf::default_stream_value) const; + [[nodiscard]] bool is_valid(rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Returns a raw pointer to the validity bool in device memory. @@ -112,7 +112,7 @@ class scalar { * @param mr Device memory resource to use for device memory allocation. */ scalar(scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -128,7 +128,7 @@ class scalar { */ scalar(data_type type, bool is_valid = false, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); }; @@ -164,7 +164,7 @@ class fixed_width_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ fixed_width_scalar(fixed_width_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -173,7 +173,7 @@ class fixed_width_scalar : public scalar { * @param value New value of scalar. 
* @param stream CUDA stream used for device memory operations. */ - void set_value(T value, rmm::cuda_stream_view stream = cudf::default_stream_value); + void set_value(T value, rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Explicit conversion operator to get the value of the scalar on the host. @@ -186,7 +186,7 @@ class fixed_width_scalar : public scalar { * @param stream CUDA stream used for device memory operations. * @return Value of the scalar */ - T value(rmm::cuda_stream_view stream = cudf::default_stream_value) const; + T value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Returns a raw pointer to the value in device memory. @@ -215,7 +215,7 @@ class fixed_width_scalar : public scalar { */ fixed_width_scalar(T value, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -228,7 +228,7 @@ class fixed_width_scalar : public scalar { */ fixed_width_scalar(rmm::device_scalar&& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); }; @@ -264,7 +264,7 @@ class numeric_scalar : public detail::fixed_width_scalar { * @param mr Device memory resource to use for device memory allocation. */ numeric_scalar(numeric_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -277,7 +277,7 @@ class numeric_scalar : public detail::fixed_width_scalar { */ numeric_scalar(T value, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -290,7 +290,7 @@ class numeric_scalar : public detail::fixed_width_scalar { */ numeric_scalar(rmm::device_scalar&& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); }; @@ -327,7 +327,7 @@ class fixed_point_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. 
*/ fixed_point_scalar(fixed_point_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -342,7 +342,7 @@ class fixed_point_scalar : public scalar { fixed_point_scalar(rep_type value, numeric::scale_type scale, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -355,7 +355,7 @@ class fixed_point_scalar : public scalar { */ fixed_point_scalar(rep_type value, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -368,7 +368,7 @@ class fixed_point_scalar : public scalar { */ fixed_point_scalar(T value, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -383,7 +383,7 @@ class fixed_point_scalar : public scalar { fixed_point_scalar(rmm::device_scalar&& data, numeric::scale_type scale, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -392,7 +392,7 @@ class fixed_point_scalar : public scalar { * @param stream CUDA stream used for device memory operations. * @return The value of the scalar */ - rep_type value(rmm::cuda_stream_view stream = cudf::default_stream_value) const; + rep_type value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Get the decimal32, decimal64 or decimal128. @@ -400,7 +400,7 @@ class fixed_point_scalar : public scalar { * @param stream CUDA stream used for device memory operations. * @return The decimal32, decimal64 or decimal128 value */ - T fixed_point_value(rmm::cuda_stream_view stream = cudf::default_stream_value) const; + T fixed_point_value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Explicit conversion operator to get the value of the scalar on the host. @@ -451,7 +451,7 @@ class string_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. 
*/ string_scalar(string_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -464,7 +464,7 @@ class string_scalar : public scalar { */ string_scalar(std::string const& string, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -479,7 +479,7 @@ class string_scalar : public scalar { */ string_scalar(value_type const& source, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -494,7 +494,7 @@ class string_scalar : public scalar { */ string_scalar(rmm::device_scalar& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -510,7 +510,7 @@ class string_scalar : public scalar { */ string_scalar(rmm::device_buffer&& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -525,7 +525,7 @@ class string_scalar : public scalar { * @return The value of the scalar in a host std::string */ [[nodiscard]] std::string to_string( - rmm::cuda_stream_view stream = cudf::default_stream_value) const; + rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Get the value of the scalar as a string_view. @@ -533,7 +533,7 @@ class string_scalar : public scalar { * @param stream CUDA stream used for device memory operations. * @return The value of the scalar as a string_view */ - [[nodiscard]] value_type value(rmm::cuda_stream_view stream = cudf::default_stream_value) const; + [[nodiscard]] value_type value(rmm::cuda_stream_view stream = cudf::get_default_stream()) const; /** * @brief Returns the size of the string in bytes. @@ -582,7 +582,7 @@ class chrono_scalar : public detail::fixed_width_scalar { * @param mr Device memory resource to use for device memory allocation. 
*/ chrono_scalar(chrono_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -595,7 +595,7 @@ class chrono_scalar : public detail::fixed_width_scalar { */ chrono_scalar(T value, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -608,7 +608,7 @@ class chrono_scalar : public detail::fixed_width_scalar { */ chrono_scalar(rmm::device_scalar&& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); }; @@ -641,7 +641,7 @@ class timestamp_scalar : public chrono_scalar { * @param mr Device memory resource to use for device memory allocation. */ timestamp_scalar(timestamp_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -657,7 +657,7 @@ class timestamp_scalar : public chrono_scalar { template timestamp_scalar(Duration2 const& value, bool is_valid, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -696,7 +696,7 @@ class duration_scalar : public chrono_scalar { * @param mr Device memory resource to use for device memory allocation. */ duration_scalar(duration_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -709,7 +709,7 @@ class duration_scalar : public chrono_scalar { */ duration_scalar(rep_type value, bool is_valid, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -744,7 +744,7 @@ class list_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. */ list_scalar(list_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -759,7 +759,7 @@ class list_scalar : public scalar { */ list_scalar(cudf::column_view const& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -772,7 +772,7 @@ class list_scalar : public scalar { */ list_scalar(cudf::column&& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -809,7 +809,7 @@ class struct_scalar : public scalar { * @param mr Device memory resource to use for device memory allocation. 
*/ struct_scalar(struct_scalar const& other, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -824,7 +824,7 @@ class struct_scalar : public scalar { */ struct_scalar(table_view const& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -839,7 +839,7 @@ class struct_scalar : public scalar { */ struct_scalar(host_span data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -855,7 +855,7 @@ class struct_scalar : public scalar { */ struct_scalar(table&& data, bool is_valid = true, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/scalar/scalar_factories.hpp b/cpp/include/cudf/scalar/scalar_factories.hpp index b2b52ddc488..78b6c4fd0e9 100644 --- a/cpp/include/cudf/scalar/scalar_factories.hpp +++ b/cpp/include/cudf/scalar/scalar_factories.hpp @@ -43,7 +43,7 @@ namespace cudf { */ std::unique_ptr make_numeric_scalar( data_type type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -60,7 +60,7 @@ std::unique_ptr make_numeric_scalar( */ std::unique_ptr make_timestamp_scalar( data_type type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -77,7 +77,7 @@ std::unique_ptr make_timestamp_scalar( */ std::unique_ptr make_duration_scalar( data_type type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -94,7 +94,7 @@ std::unique_ptr make_duration_scalar( */ std::unique_ptr make_fixed_width_scalar( data_type type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -111,7 +111,7 @@ std::unique_ptr make_fixed_width_scalar( */ std::unique_ptr make_string_scalar( std::string const& string, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -126,7 +126,7 @@ std::unique_ptr make_string_scalar( */ std::unique_ptr make_default_constructed_scalar( data_type type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -141,7 +141,7 @@ std::unique_ptr make_default_constructed_scalar( */ std::unique_ptr make_empty_scalar_like( column_view const& input, - rmm::cuda_stream_view stream = 
cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -156,7 +156,7 @@ std::unique_ptr make_empty_scalar_like( template std::unique_ptr make_fixed_width_scalar( T value, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { return std::make_unique>(value, true, stream, mr); @@ -176,7 +176,7 @@ template std::unique_ptr make_fixed_point_scalar( typename T::rep value, numeric::scale_type scale, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) { return std::make_unique>(value, scale, true, stream, mr); @@ -192,7 +192,7 @@ std::unique_ptr make_fixed_point_scalar( */ std::unique_ptr make_list_scalar( column_view elements, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -207,7 +207,7 @@ std::unique_ptr make_list_scalar( */ std::unique_ptr make_struct_scalar( table_view const& data, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -222,7 +222,7 @@ std::unique_ptr make_struct_scalar( */ std::unique_ptr make_struct_scalar( host_span data, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @} */ // end of group diff --git a/cpp/include/cudf/sorting.hpp b/cpp/include/cudf/sorting.hpp index cf21da1b030..f43089210fd 100644 --- a/cpp/include/cudf/sorting.hpp +++ b/cpp/include/cudf/sorting.hpp @@ -207,9 +207,31 @@ std::unique_ptr rank( /** * @brief Returns sorted order after sorting each segment in the table. * - * If segment_offsets contains values larger than number of rows, behavior is undefined. + * If segment_offsets contains values larger than the number of rows, the behavior is undefined. * @throws cudf::logic_error if `segment_offsets` is not `size_type` column. * + * @code{.pseudo} + * Example: + * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} } + * offsets = {0, 3, 7, 10} + * result = cudf::segmented_sorted_order(keys, offsets); + * result is { 2,1,0, 6,5,4,3, 9,8,7 } + * @endcode + * + * If segment_offsets is empty or contains a single index, no values are sorted + * and the result is a sequence of integers from 0 to keys.size()-1. + * + * The segment_offsets are not required to include all indices. Any indices + * outside the specified segments will not be sorted. + * + * @code{.pseudo} + * Example: (offsets do not cover all indices) + * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} } + * offsets = {3, 7} + * result = cudf::segmented_sorted_order(keys, offsets); + * result is { 0,1,2, 6,5,4,3, 7,8,9 } + * @endcode + * * @param keys The table that determines the ordering of elements in each segment * @param segment_offsets The column of `size_type` type containing start offset index for each * contiguous segment. 
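As a usage sketch of the segmented-sort API documented above: this is a minimal, hedged example that assumes the cudf::test::fixed_width_column_wrapper helpers from libcudf's own test utilities; the demo function name is illustrative and not part of this diff.

#include <cudf/sorting.hpp>
#include <cudf/table/table_view.hpp>
#include <cudf_test/column_wrapper.hpp>

void segmented_sorted_order_demo()
{
  // Keys in descending order, split into three segments: [0,3), [3,7), [7,10)
  cudf::test::fixed_width_column_wrapper<int32_t> keys{9, 8, 7, 6, 5, 4, 3, 2, 1, 0};
  cudf::test::fixed_width_column_wrapper<cudf::size_type> offsets{0, 3, 7, 10};

  // Per the pseudo example in the doc comment, result holds {2,1,0, 6,5,4,3, 9,8,7}:
  // each segment is ordered independently of the others
  auto result = cudf::segmented_sorted_order(cudf::table_view{{keys}}, offsets);
}

The returned column is a permutation of row indices that only reorders rows within each segment; rows outside the given offsets keep their original positions.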
@@ -246,10 +268,34 @@ std::unique_ptr stable_segmented_sorted_order( /** * @brief Performs a lexicographic segmented sort of a table * - * If segment_offsets contains values larger than number of rows, behavior is undefined. + * If segment_offsets contains values larger than the number of rows, the behavior is undefined. * @throws cudf::logic_error if `values.num_rows() != keys.num_rows()`. * @throws cudf::logic_error if `segment_offsets` is not `size_type` column. * + * @code{.pseudo} + * Example: + * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} } + * values = { {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'} } + * offsets = {0, 3, 7, 10} + * result = cudf::segmented_sort_by_key(keys, values, offsets); + * result is { 'c','b','a', 'g','f','e','d', 'j','i','h' } + * @endcode + * + * If segment_offsets is empty or contains a single index, no values are sorted + * and the result is a copy of the values. + * + * The segment_offsets are not required to include all indices. Any indices + * outside the specified segments will not be sorted. + * + * @code{.pseudo} + * Example: (offsets do not cover all indices) + * keys = { {9, 8, 7, 6, 5, 4, 3, 2, 1, 0} } + * values = { {'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j'} } + * offsets = {3, 7} + * result = cudf::segmented_sort_by_key(keys, values, offsets); + * result is { 'a','b','c', 'g','f','e','d', 'h','i','j' } + * @endcode + * * @param values The table to reorder * @param keys The table that determines the ordering of elements in each segment * @param segment_offsets The column of `size_type` type containing start offset index for each diff --git a/cpp/include/cudf/strings/contains.hpp b/cpp/include/cudf/strings/contains.hpp index d95dc2c418c..1718d205871 100644 --- a/cpp/include/cudf/strings/contains.hpp +++ b/cpp/include/cudf/strings/contains.hpp @@ -24,6 +24,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_contains * @{ @@ -58,6 +61,32 @@ std::unique_ptr contains_re( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying rows which + * match the given regex_program object + * + * @code{.pseudo} + * Example: + * s = ["abc", "123", "def456"] + * p = regex_program::create("\\d+") + * r = contains_re(s, p) + * r is now [false, true, true] + * @endcode + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string + */ +std::unique_ptr contains_re( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a boolean column identifying rows which * matching the given regex pattern but only at the beginning the string. @@ -85,6 +114,32 @@ std::unique_ptr matches_re( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a boolean column identifying rows which + * match the given regex_program object but only at the beginning of the string.
+ * + * @code{.pseudo} + * Example: + * s = ["abc", "123", "def456"] + * p = regex_program::create("\\d+") + * r = matches_re(s, p) + * r is now [false, true, false] + * @endcode + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New column of boolean results for each string + */ +std::unique_ptr matches_re( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns the number of times the given regex pattern * matches in each string. @@ -112,6 +167,32 @@ std::unique_ptr count_re( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns the number of times the given regex_program's pattern + * matches in each string + * + * @code{.pseudo} + * Example: + * s = ["abc", "123", "def45"] + * p = regex_program::create("\\d") + * r = count_re(s, p) + * r is now [0, 3, 2] + * @endcode + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New INT32 column with counts for each string + */ +std::unique_ptr count_re( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a boolean column identifying rows which * match the given like pattern. diff --git a/cpp/include/cudf/strings/detail/char_tables.hpp b/cpp/include/cudf/strings/detail/char_tables.hpp index 4ea7e3ee952..275b7223a3b 100644 --- a/cpp/include/cudf/strings/detail/char_tables.hpp +++ b/cpp/include/cudf/strings/detail/char_tables.hpp @@ -46,6 +46,7 @@ constexpr uint8_t IS_LOWER(uint8_t x) { return ((x) & (1 << 6)); } constexpr uint8_t IS_SPECIAL(uint8_t x) { return ((x) & (1 << 7)); } constexpr uint8_t IS_ALPHANUM(uint8_t x) { return ((x) & (0x0F)); } constexpr uint8_t IS_UPPER_OR_LOWER(uint8_t x) { return ((x) & ((1 << 5) | (1 << 6))); } +constexpr uint8_t ALL_FLAGS = 0xFF; // Type for the character cases table. using character_cases_table_type = uint16_t; diff --git a/cpp/include/cudf/strings/detail/combine.hpp b/cpp/include/cudf/strings/detail/combine.hpp index 7df3a4ce324..3b8ed0f4e0d 100644 --- a/cpp/include/cudf/strings/detail/combine.hpp +++ b/cpp/include/cudf/strings/detail/combine.hpp @@ -34,13 +34,12 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches. 
*/ -std::unique_ptr concatenate( - table_view const& strings_columns, - string_scalar const& separator, - string_scalar const& narep, - separator_on_nulls separate_nulls = separator_on_nulls::YES, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate(table_view const& strings_columns, + string_scalar const& separator, + string_scalar const& narep, + separator_on_nulls separate_nulls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc join_strings(table_view const&,string_scalar const&,string_scalar @@ -48,12 +47,11 @@ std::unique_ptr concatenate( * * @param stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr join_strings( - strings_column_view const& strings, - string_scalar const& separator, - string_scalar const& narep, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr join_strings(strings_column_view const& strings, + string_scalar const& separator, + string_scalar const& narep, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc join_list_elements(table_view const&,string_scalar const&,string_scalar diff --git a/cpp/include/cudf/strings/detail/concatenate.hpp b/cpp/include/cudf/strings/detail/concatenate.hpp index 0df86db60b6..511e240886a 100644 --- a/cpp/include/cudf/strings/detail/concatenate.hpp +++ b/cpp/include/cudf/strings/detail/concatenate.hpp @@ -42,10 +42,9 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New column with concatenated results. */ -std::unique_ptr concatenate( - host_span columns, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr concatenate(host_span columns, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/copy_if_else.cuh b/cpp/include/cudf/strings/detail/copy_if_else.cuh index 79cec779e02..374c3b2cf68 100644 --- a/cpp/include/cudf/strings/detail/copy_if_else.cuh +++ b/cpp/include/cudf/strings/detail/copy_if_else.cuh @@ -56,13 +56,12 @@ namespace detail { * @return New strings column. 
*/ template -std::unique_ptr copy_if_else( - StringIterLeft lhs_begin, - StringIterLeft lhs_end, - StringIterRight rhs_begin, - Filter filter_fn, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr copy_if_else(StringIterLeft lhs_begin, + StringIterLeft lhs_end, + StringIterRight rhs_begin, + Filter filter_fn, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto strings_count = std::distance(lhs_begin, lhs_end); if (strings_count == 0) return make_empty_column(type_id::STRING); diff --git a/cpp/include/cudf/strings/detail/copy_range.cuh b/cpp/include/cudf/strings/detail/copy_range.cuh index e83f6dc0005..ee09ce9a7a9 100644 --- a/cpp/include/cudf/strings/detail/copy_range.cuh +++ b/cpp/include/cudf/strings/detail/copy_range.cuh @@ -99,14 +99,13 @@ namespace detail { * @return std::unique_ptr The result target column */ template -std::unique_ptr copy_range( - SourceValueIterator source_value_begin, - SourceValidityIterator source_validity_begin, - strings_column_view const& target, - size_type target_begin, - size_type target_end, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr copy_range(SourceValueIterator source_value_begin, + SourceValidityIterator source_validity_begin, + strings_column_view const& target, + size_type target_begin, + size_type target_end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS( (target_begin >= 0) && (target_begin < target.size()) && (target_end <= target.size()), diff --git a/cpp/include/cudf/strings/detail/copying.hpp b/cpp/include/cudf/strings/detail/copying.hpp index 56e9c35c889..7e82ad4c679 100644 --- a/cpp/include/cudf/strings/detail/copying.hpp +++ b/cpp/include/cudf/strings/detail/copying.hpp @@ -49,12 +49,11 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column of size (end-start)/step. */ -std::unique_ptr copy_slice( - strings_column_view const& strings, - size_type start, - size_type end = -1, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr copy_slice(strings_column_view const& strings, + size_type start, + size_type end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Returns a new strings column created by shifting the rows by a specified offset. diff --git a/cpp/include/cudf/strings/detail/fill.hpp b/cpp/include/cudf/strings/detail/fill.hpp index e8f9c9ca438..43e3f6198f3 100644 --- a/cpp/include/cudf/strings/detail/fill.hpp +++ b/cpp/include/cudf/strings/detail/fill.hpp @@ -42,13 +42,12 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. 
*/ -std::unique_ptr fill( - strings_column_view const& strings, - size_type begin, - size_type end, - string_scalar const& value, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr fill(strings_column_view const& strings, + size_type begin, + size_type end, + string_scalar const& value, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/gather.cuh b/cpp/include/cudf/strings/detail/gather.cuh index dfc8f0dacc5..28b98eac3b5 100644 --- a/cpp/include/cudf/strings/detail/gather.cuh +++ b/cpp/include/cudf/strings/detail/gather.cuh @@ -288,12 +288,11 @@ std::unique_ptr gather_chars(StringIterator strings_begin, * @return New strings column containing the gathered strings. */ template -std::unique_ptr gather( - strings_column_view const& strings, - MapIterator begin, - MapIterator end, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr gather(strings_column_view const& strings, + MapIterator begin, + MapIterator end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto const output_count = std::distance(begin, end); auto const strings_count = strings.size(); @@ -306,7 +305,7 @@ std::unique_ptr gather( auto const d_in_offsets = (strings_count > 0) ? strings.offsets_begin() : nullptr; auto const d_strings = column_device_view::create(strings.parent(), stream); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), begin, end, d_out_offsets, @@ -318,7 +317,7 @@ std::unique_ptr gather( // check total size is not too large size_t const total_bytes = thrust::transform_reduce( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), d_out_offsets, d_out_offsets + output_count, [] __device__(auto size) { return static_cast(size); }, @@ -328,8 +327,10 @@ std::unique_ptr gather( "total size of output strings is too large for a cudf column"); // In-place convert output sizes into offsets - thrust::exclusive_scan( - rmm::exec_policy(stream), d_out_offsets, d_out_offsets + output_count + 1, d_out_offsets); + thrust::exclusive_scan(rmm::exec_policy_nosync(stream), + d_out_offsets, + d_out_offsets + output_count + 1, + d_out_offsets); // build chars column cudf::device_span const d_out_offsets_span(d_out_offsets, output_count + 1); @@ -372,13 +373,12 @@ std::unique_ptr gather( * @return New strings column containing the gathered strings. 
*/ template -std::unique_ptr gather( - strings_column_view const& strings, - MapIterator begin, - MapIterator end, - bool nullify_out_of_bounds, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr gather(strings_column_view const& strings, + MapIterator begin, + MapIterator end, + bool nullify_out_of_bounds, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (nullify_out_of_bounds) return gather(strings, begin, end, stream, mr); return gather(strings, begin, end, stream, mr); diff --git a/cpp/include/cudf/strings/detail/json.hpp b/cpp/include/cudf/strings/detail/json.hpp index 90188910c7d..0fb06d36570 100644 --- a/cpp/include/cudf/strings/detail/json.hpp +++ b/cpp/include/cudf/strings/detail/json.hpp @@ -16,6 +16,8 @@ #pragma once +#include +#include #include #include @@ -30,12 +32,11 @@ namespace detail { * * @param stream CUDA stream used for device memory operations and kernel launches */ -std::unique_ptr get_json_object( - cudf::strings_column_view const& col, - cudf::string_scalar const& json_path, - get_json_object_options options, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr get_json_object(cudf::strings_column_view const& col, + cudf::string_scalar const& json_path, + cudf::strings::get_json_object_options options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/detail/pad_impl.cuh b/cpp/include/cudf/strings/detail/pad_impl.cuh new file mode 100644 index 00000000000..648c240bfbc --- /dev/null +++ b/cpp/include/cudf/strings/detail/pad_impl.cuh @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2020-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +namespace cudf { +namespace strings { +namespace detail { + +/** + * @brief Return the size in bytes of padding d_str to width characters using a fill character + * with byte length of fill_char_size + * + * Pad does not perform truncation. That is, if `d_str.length() > width` then `d_str.size_bytes()` + * is returned. + * + * @param d_str String to pad + * @param width Number of characters for the padded string result + * @param fill_char_size Size of the fill character in bytes + * @return The number of bytes required for the pad + */ +__device__ size_type compute_padded_size(string_view d_str, + size_type width, + size_type fill_char_size) +{ + auto const length = d_str.length(); + auto bytes = d_str.size_bytes(); + if (width > length) // no truncating; + bytes += fill_char_size * (width - length); // add padding + return bytes; +} + +/** + * @brief Pad d_str with fill_char into output up to width characters + * + * Pad does not perform truncation. That is, if `d_str.length() > width` then + * d_str is copied into output.
+ * + * @tparam side Specifies where fill_char is added to d_str + * @param d_str String to pad + * @param width Number of characters for the padded string result + * @param fill_char UTF-8 character used for padding + * @param output Device memory to copy the padded string into + */ +template +__device__ void pad_impl(cudf::string_view d_str, + cudf::size_type width, + cudf::char_utf8 fill_char, + char* output) +{ + auto length = d_str.length(); + if constexpr (side == side_type::LEFT) { + while (length++ < width) { + output += from_char_utf8(fill_char, output); + } + copy_string(output, d_str); + } + if constexpr (side == side_type::RIGHT) { + output = copy_string(output, d_str); + while (length++ < width) { + output += from_char_utf8(fill_char, output); + } + } + if constexpr (side == side_type::BOTH) { + auto const pad_size = width - length; + // an odd width will right-justify + auto right_pad = (width % 2) ? pad_size / 2 : (pad_size - pad_size / 2); + auto left_pad = pad_size - right_pad; // e.g. width=7: "++foxx+"; width=6: "+fox++" + while (left_pad-- > 0) { + output += from_char_utf8(fill_char, output); + } + output = copy_string(output, d_str); + while (right_pad-- > 0) { + output += from_char_utf8(fill_char, output); + } + } +} + +/** + * @brief Prepend d_str with '0' into output up to width characters + * + * Pad does not perform truncation. That is, if `d_str.length() > width` then + * d_str is copied into output. + * + * If d_str starts with a sign character ('-' or '+') then '0' padding + * starts after the sign. + * + * @param d_str String to pad + * @param width Number of characters for the padded string result + * @param output Device memory to copy the padded string into + */ +__device__ void zfill_impl(cudf::string_view d_str, cudf::size_type width, char* output) +{ + auto length = d_str.length(); + auto in_ptr = d_str.data(); + // if the string starts with a sign, output the sign first + if (!d_str.empty() && (*in_ptr == '-' || *in_ptr == '+')) { + *output++ = *in_ptr++; + d_str = cudf::string_view{in_ptr, d_str.size_bytes() - 1}; + } + while (length++ < width) + *output++ = '0'; // prepend zero char + copy_string(output, d_str); +} + +} // namespace detail +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/detail/replace.hpp b/cpp/include/cudf/strings/detail/replace.hpp index ce1d5e8a925..aa6fb2feb3d 100644 --- a/cpp/include/cudf/strings/detail/replace.hpp +++ b/cpp/include/cudf/strings/detail/replace.hpp @@ -43,13 +43,12 @@ enum class replace_algorithm { * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ template -std::unique_ptr replace( - strings_column_view const& strings, - string_scalar const& target, - string_scalar const& repl, - int32_t maxrepl = -1, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace(strings_column_view const& strings, + string_scalar const& target, + string_scalar const& repl, + int32_t maxrepl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::strings::replace_slice(strings_column_view const&, string_scalar const&, * * @param[in] stream CUDA stream used for device memory operations and kernel launches.
*/ -std::unique_ptr replace_slice( - strings_column_view const& strings, - string_scalar const& repl = string_scalar(""), - size_type start = 0, - size_type stop = -1, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace_slice(strings_column_view const& strings, + string_scalar const& repl, + size_type start, + size_type stop, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc cudf::strings::replace(strings_column_view const&, strings_column_view const&, @@ -71,12 +69,11 @@ std::unique_ptr replace_slice( * * @param[in] stream CUDA stream used for device memory operations and kernel launches. */ -std::unique_ptr replace( - strings_column_view const& strings, - strings_column_view const& targets, - strings_column_view const& repls, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace(strings_column_view const& strings, + strings_column_view const& targets, + strings_column_view const& repls, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Replaces any null string entries with the given string. @@ -96,11 +93,10 @@ std::unique_ptr replace( * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings column. */ -std::unique_ptr replace_nulls( - strings_column_view const& strings, - string_scalar const& repl = string_scalar(""), - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr replace_nulls(strings_column_view const& strings, + string_scalar const& repl, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/detail/scatter.cuh b/cpp/include/cudf/strings/detail/scatter.cuh index d430f390f10..55dd5bda260 100644 --- a/cpp/include/cudf/strings/detail/scatter.cuh +++ b/cpp/include/cudf/strings/detail/scatter.cuh @@ -57,18 +57,18 @@ namespace detail { * @return New strings column. */ template -std::unique_ptr scatter( - SourceIterator begin, - SourceIterator end, - MapIterator scatter_map, - strings_column_view const& target, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr scatter(SourceIterator begin, + SourceIterator end, + MapIterator scatter_map, + strings_column_view const& target, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { if (target.is_empty()) return make_empty_column(type_id::STRING); // create vector of string_view's to scatter into - rmm::device_uvector target_vector = create_string_vector_from_column(target, stream); + rmm::device_uvector target_vector = + create_string_vector_from_column(target, stream, rmm::mr::get_current_device_resource()); // this ensures empty strings are not mapped to nulls in the make_strings_column function auto const size = thrust::distance(begin, end); @@ -76,7 +76,8 @@ std::unique_ptr scatter( begin, [] __device__(string_view const sv) { return sv.empty() ? 
string_view{} : sv; }); // do the scatter - thrust::scatter(rmm::exec_policy(stream), itr, itr + size, scatter_map, target_vector.begin()); + thrust::scatter( + rmm::exec_policy_nosync(stream), itr, itr + size, scatter_map, target_vector.begin()); // build the output column auto sv_span = cudf::device_span(target_vector); diff --git a/cpp/include/cudf/strings/detail/utilities.cuh b/cpp/include/cudf/strings/detail/utilities.cuh index 592f2128d0e..76e5f931981 100644 --- a/cpp/include/cudf/strings/detail/utilities.cuh +++ b/cpp/include/cudf/strings/detail/utilities.cuh @@ -50,11 +50,10 @@ namespace detail { * @return offsets child column for strings column */ template -std::unique_ptr make_offsets_child_column( - InputIterator begin, - InputIterator end, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +std::unique_ptr make_offsets_child_column(InputIterator begin, + InputIterator end, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_EXPECTS(begin < end, "Invalid iterator range"); auto count = thrust::distance(begin, end); @@ -117,12 +116,11 @@ __device__ inline char* copy_string(char* buffer, const string_view& d_string) * @return offsets child column and chars child column for a strings column */ template -auto make_strings_children( - SizeAndExecuteFunction size_and_exec_fn, - size_type exec_size, - size_type strings_count, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, + size_type exec_size, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { auto offsets_column = make_numeric_column( data_type{type_id::INT32}, strings_count + 1, mask_state::UNALLOCATED, stream, mr); @@ -175,11 +173,10 @@ auto make_strings_children( * @return offsets child column and chars child column for a strings column */ template -auto make_strings_children( - SizeAndExecuteFunction size_and_exec_fn, - size_type strings_count, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) +auto make_strings_children(SizeAndExecuteFunction size_and_exec_fn, + size_type strings_count, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { return make_strings_children(size_and_exec_fn, strings_count, strings_count, stream, mr); } diff --git a/cpp/include/cudf/strings/detail/utilities.hpp b/cpp/include/cudf/strings/detail/utilities.hpp index ceae93dfe84..41a2654dce3 100644 --- a/cpp/include/cudf/strings/detail/utilities.hpp +++ b/cpp/include/cudf/strings/detail/utilities.hpp @@ -36,10 +36,9 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return The chars child column for a strings column. */ -std::unique_ptr create_chars_child_column( - size_type bytes, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr create_chars_child_column(size_type bytes, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @brief Creates a string_view vector from a strings column. 
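The make_strings_children helper above encapsulates libcudf's two-pass strings-building idiom: the same functor is invoked once with d_chars == nullptr to record each output string's byte size in d_offsets, then again after the offsets scan to write the characters. Below is a hedged sketch of a functor conforming to that contract; the name repeat_twice_fn and the string-doubling transform are invented for illustration only.

#include <cudf/column/column_device_view.cuh>
#include <cudf/strings/detail/utilities.cuh>
#include <cudf/strings/string_view.cuh>

struct repeat_twice_fn {
  cudf::column_device_view d_strings;  // input strings
  cudf::size_type* d_offsets{};        // filled by the sizing pass
  char* d_chars{};                     // null during the sizing pass

  __device__ void operator()(cudf::size_type idx)
  {
    if (d_strings.is_null(idx)) {
      if (!d_chars) { d_offsets[idx] = 0; }
      return;
    }
    auto const d_str = d_strings.element<cudf::string_view>(idx);
    if (!d_chars) {
      d_offsets[idx] = 2 * d_str.size_bytes();  // sizing pass: doubled output
    } else {
      // execution pass: write the string twice using the copy_string utility above
      auto out = cudf::strings::detail::copy_string(d_chars + d_offsets[idx], d_str);
      cudf::strings::detail::copy_string(out, d_str);
    }
  }
};

// Hypothetical call site, given a column_device_view d_col of size n:
//   auto [offsets, chars] = cudf::strings::detail::make_strings_children(
//       repeat_twice_fn{d_col}, n, stream, mr);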
@@ -51,8 +50,8 @@ std::unique_ptr create_chars_child_column( */ rmm::device_uvector create_string_vector_from_column( cudf::strings_column_view const strings, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace strings diff --git a/cpp/include/cudf/strings/extract.hpp b/cpp/include/cudf/strings/extract.hpp index a30098bedb9..a80d971438d 100644 --- a/cpp/include/cudf/strings/extract.hpp +++ b/cpp/include/cudf/strings/extract.hpp @@ -23,6 +23,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_substring * @{ @@ -61,6 +64,37 @@ std::unique_ptr
extract( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a table of strings columns where each column corresponds to the matching + * group specified in the given regex_program object + * + * All the strings for the first group will go in the first output column; those for the second + * group go in the second column, and so on. Null entries are added to the columns in row `i` if + * the string at row `i` does not match. + * + * Any null string entries return corresponding null output column entries. + * + * @code{.pseudo} + * Example: + * s = ["a1", "b2", "c3"] + * p = regex_program::create("([ab])(\\d)") + * r = extract(s, p) + * r is now [ ["a", "b", null], + * ["1", "2", null] ] + * @endcode + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned table's device memory + * @return Columns of strings extracted from the input column + */ +std::unique_ptr
extract( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** * @brief Returns a lists column of strings where each string column row corresponds to the * matching group specified in the given regular expression pattern. @@ -96,6 +130,40 @@ std::unique_ptr extract_all_record( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a lists column of strings where each string column row corresponds to the + * matching group specified in the given regex_program object + * + * All the matching groups for the first row will go in the first row output column; the second + * row results will go into the second row output column and so on. + * + * A null output row will result if the corresponding input string row does not match or + * that input row is null. + * + * @code{.pseudo} + * Example: + * s = ["a1 b4", "b2", "c3 a5", "b", null] + * p = regex_program::create("([ab])(\\d)") + * r = extract_all_record(s, p) + * r is now [ ["a", "1", "b", "4"], + * ["b", "2"], + * ["a", "5"], + * null, + * null ] + * @endcode + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate any returned device memory + * @return Lists column containing strings extracted from the input column + */ +std::unique_ptr extract_all_record( + strings_column_view const& strings, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/findall.hpp b/cpp/include/cudf/strings/findall.hpp index 6969ba35b1b..366e1eb0482 100644 --- a/cpp/include/cudf/strings/findall.hpp +++ b/cpp/include/cudf/strings/findall.hpp @@ -23,6 +23,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_contains * @{ @@ -63,6 +66,39 @@ std::unique_ptr findall( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief Returns a lists column of strings for each matching occurrence using + * the regex_program pattern within each string + * + * Each output row includes all the substrings within the corresponding input row + * that match the given pattern. If no matches are found, the output row is empty. + * + * @code{.pseudo} + * Example: + * s = ["bunny", "rabbit", "hare", "dog"] + * p = regex_program::create("[ab]") + * r = findall(s, p) + * r is now a lists column like: + * [ ["b"] + * ["a","b","b"] + * ["a"] + * [] ] + * @endcode + * + * A null output row occurs if the corresponding input row is null. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. 
+ * + * @param input Strings instance for this operation + * @param prog Regex program instance + * @param mr Device memory resource used to allocate the returned column's device memory + * @return New lists column of strings + */ +std::unique_ptr findall( + strings_column_view const& input, + regex_program const& prog, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/regex/flags.hpp b/cpp/include/cudf/strings/regex/flags.hpp index 3a7051345fa..44ca68439e7 100644 --- a/cpp/include/cudf/strings/regex/flags.hpp +++ b/cpp/include/cudf/strings/regex/flags.hpp @@ -21,7 +21,7 @@ namespace cudf { namespace strings { /** - * @addtogroup strings_contains + * @addtogroup strings_regex * @{ */ diff --git a/cpp/include/cudf/strings/regex/regex_program.hpp b/cpp/include/cudf/strings/regex/regex_program.hpp new file mode 100644 index 00000000000..2b606393719 --- /dev/null +++ b/cpp/include/cudf/strings/regex/regex_program.hpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include +#include + +#include +#include + +namespace cudf { +namespace strings { + +/** + * @addtogroup strings_regex + * @{ + */ + +/** + * @brief Regex program class + * + * Create an instance from a regex pattern and use it to call the appropriate + * strings APIs. An instance can be reused. + * + * See the @ref md_regex "Regex Features" page for details on patterns and APIs that support regex. 
+ */ +struct regex_program { + struct regex_program_impl; + + /** + * @brief Create a program from a pattern + * + * @throw cudf::logic_error If pattern is invalid or contains unsupported features + * + * @param pattern Regex pattern + * @param flags Regex flags for interpreting special characters in the pattern + * @param capture Controls how capture groups in the pattern are used + * @return Instance of this object + */ + static std::unique_ptr create(std::string_view pattern, + regex_flags flags = regex_flags::DEFAULT, + capture_groups capture = capture_groups::EXTRACT); + + /** + * @brief Move constructor + * + * @param other Object to move from + */ + regex_program(regex_program&& other); + + /** + * @brief Move assignment operator + * + * @param other Object to move from + * @return this object + */ + regex_program& operator=(regex_program&& other); + + /** + * @brief Return the pattern used to create this instance + * + * @return regex pattern as a string + */ + std::string pattern() const; + + /** + * @brief Return the regex_flags used to create this instance + * + * @return regex flags setting + */ + regex_flags flags() const; + + /** + * @brief Return the capture_groups used to create this instance + * + * @return capture groups setting + */ + capture_groups capture() const; + + /** + * @brief Return the number of instructions in this instance + * + * @return Number of instructions + */ + int32_t instructions_count() const; + + /** + * @brief Return the number of capture groups in this instance + * + * @return Number of groups + */ + int32_t groups_count() const; + + /** + * @brief Return the size of the working memory needed to evaluate this program + * + * @param num_strings Number of strings for computation + * @return Size of the working memory in bytes + */ + std::size_t compute_working_memory_size(int32_t num_strings) const; + + ~regex_program(); + + private: + regex_program() = delete; + + std::string _pattern; + regex_flags _flags; + capture_groups _capture; + + std::unique_ptr _impl; + + /** + * @brief Constructor + * + * Called by create() + */ + regex_program(std::string_view pattern, regex_flags flags, capture_groups capture); + + friend struct regex_device_builder; +}; + +/** @} */ // end of doxygen group +} // namespace strings +} // namespace cudf diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp index d80b9a89b81..60c66956fb8 100644 --- a/cpp/include/cudf/strings/replace_re.hpp +++ b/cpp/include/cudf/strings/replace_re.hpp @@ -26,6 +26,9 @@ namespace cudf { namespace strings { + +struct regex_program; + /** * @addtogroup strings_replace + * @{ @@ -58,6 +61,30 @@ std::unique_ptr replace_re( regex_flags const flags = regex_flags::DEFAULT, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +/** + * @brief For each string, replaces any character sequence matching the given regex + * with the provided replacement string. + * + * Any null string entries return corresponding null output column entries. + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @param strings Strings instance for this operation + * @param prog Regex program instance + * @param replacement The string used to replace the matched sequence in each string. + * Default is an empty string. + * @param max_replace_count The maximum number of times to replace the matched pattern + * within each string. Default replaces every substring that is matched.
diff --git a/cpp/include/cudf/strings/replace_re.hpp b/cpp/include/cudf/strings/replace_re.hpp
index d80b9a89b81..60c66956fb8 100644
--- a/cpp/include/cudf/strings/replace_re.hpp
+++ b/cpp/include/cudf/strings/replace_re.hpp
@@ -26,6 +26,9 @@
 namespace cudf {
 namespace strings {
+
+struct regex_program;
+
 /**
  * @addtogroup strings_replace
  * @{
@@ -58,6 +61,30 @@ std::unique_ptr<column> replace_re(
   regex_flags const flags = regex_flags::DEFAULT,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief For each string, replaces any character sequence matching the given regex
+ * with the provided replacement string.
+ *
+ * Any null string entries return corresponding null output column entries.
+ *
+ * See the @ref md_regex "Regex Features" page for details on patterns supported by this API.
+ *
+ * @param strings Strings instance for this operation
+ * @param prog Regex program instance
+ * @param replacement The string used to replace the matched sequence in each string.
+ *        Default is an empty string.
+ * @param max_replace_count The maximum number of times to replace the matched pattern
+ *        within each string. Default replaces every substring that is matched.
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column
+ */
+std::unique_ptr<column> replace_re(
+  strings_column_view const& strings,
+  regex_program const& prog,
+  string_scalar const& replacement = string_scalar(""),
+  std::optional<size_type> max_replace_count = std::nullopt,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief For each string, replaces any character sequence matching the given patterns
  * with the corresponding string in the `replacements` column.
@@ -105,5 +132,28 @@ std::unique_ptr<column> replace_with_backrefs(
   regex_flags const flags = regex_flags::DEFAULT,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief For each string, replaces any character sequence matching the given regex
+ * using the replacement template for back-references.
+ *
+ * Any null string entries return corresponding null output column entries.
+ *
+ * See the @ref md_regex "Regex Features" page for details on patterns supported by this API.
+ *
+ * @throw cudf::logic_error if capture index values in `replacement` are not in range 0-99, and also
+ * if the index exceeds the group count specified in the pattern
+ *
+ * @param strings Strings instance for this operation
+ * @param prog Regex program instance
+ * @param replacement The replacement template for creating the output string
+ * @param mr Device memory resource used to allocate the returned column's device memory
+ * @return New strings column
+ */
+std::unique_ptr<column> replace_with_backrefs(
+  strings_column_view const& strings,
+  regex_program const& prog,
+  std::string_view replacement,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 }  // namespace strings
 }  // namespace cudf
diff --git a/cpp/include/cudf/strings/split/split_re.hpp b/cpp/include/cudf/strings/split/split_re.hpp
index 6fe07b0f5dc..c6bd1345ae6 100644
--- a/cpp/include/cudf/strings/split/split_re.hpp
+++ b/cpp/include/cudf/strings/split/split_re.hpp
@@ -23,6 +23,9 @@
 namespace cudf {
 namespace strings {
+
+struct regex_program;
+
 /**
  * @addtogroup strings_split
  * @{
@@ -77,6 +80,58 @@ std::unique_ptr<table>
split_re(
   size_type maxsplit = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Splits strings elements into a table of strings columns
+ * using a regex_program's pattern to delimit each string
+ *
+ * Each element generates a vector of strings that are stored in corresponding
+ * rows in the output table -- `table[col,row] = token[col] of strings[row]`
+ * where `token` is a substring between delimiters.
+ *
+ * The number of rows in the output table will be the same as the number of
+ * elements in the input column. The resulting number of columns will be the
+ * maximum number of tokens found in any input row.
+ *
+ * The `pattern` is used to identify the delimiters within a string
+ * and splitting stops when either `maxsplit` or the end of the string is reached.
+ *
+ * An empty input string will produce a corresponding empty string in the
+ * corresponding row of the first column.
+ * A null row will produce corresponding null rows in the output table.
+ *
+ * The regex_program's regex_flags are ignored.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
+ * p1 = regex_program::create("[_ ]")
+ * s1 = split_re(s, p1)
+ * s1 is a table of strings columns:
+ *     [ ["a", "a", "", "ab"],
+ *       ["bc", "", "ab", "cd"],
+ *       ["def", "bc", "cd", ""],
+ *       ["g", null, null, null] ]
+ * p2 = regex_program::create("[ _]")
+ * s2 = split_re(s, p2, 1)
+ * s2 is a table of strings columns:
+ *     [ ["a", "a", "", "ab"],
+ *       ["bc def_g", "_bc", "ab cd", "cd "] ]
+ * @endcode
+ *
+ * @throw cudf::logic_error if the pattern of `prog` is empty.
+ *
+ * @param input A column of string elements to be split
+ * @param prog Regex program instance
+ * @param maxsplit Maximum number of splits to perform.
+ *        Default of -1 indicates all possible splits on each string.
+ * @param mr Device memory resource used to allocate the returned result's device memory
+ * @return A table of columns of strings
+ */
+std::unique_ptr<table> split_re(
+  strings_column_view const& input,
+  regex_program const& prog,
+  size_type maxsplit = -1,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Splits strings elements into a table of strings columns
  * using a regex pattern to delimit each string starting from the end of the string.
@@ -127,6 +182,60 @@ std::unique_ptr<table> rsplit_re(
   size_type maxsplit = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Splits strings elements into a table of strings columns using a
+ * regex_program's pattern to delimit each string starting from the end of the string
+ *
+ * Each element generates a vector of strings that are stored in corresponding
+ * rows in the output table -- `table[col,row] = token[col] of string[row]`
+ * where `token` is the substring between each delimiter.
+ *
+ * The number of rows in the output table will be the same as the number of
+ * elements in the input column. The resulting number of columns will be the
+ * maximum number of tokens found in any input row.
+ *
+ * Splitting occurs by traversing starting from the end of the input string.
+ * The `pattern` is used to identify the delimiters within a string
+ * and splitting stops when either `maxsplit` or the beginning of the string
+ * is reached.
+ *
+ * An empty input string will produce a corresponding empty string in the
+ * corresponding row of the first column.
+ * A null row will produce corresponding null rows in the output table.
+ *
+ * The regex_program's regex_flags are ignored.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
+ * p1 = regex_program::create("[_ ]")
+ * s1 = rsplit_re(s, p1)
+ * s1 is a table of strings columns:
+ *     [ ["a", "a", "", "ab"],
+ *       ["bc", "", "ab", "cd"],
+ *       ["def", "bc", "cd", ""],
+ *       ["g", null, null, null] ]
+ * p2 = regex_program::create("[ _]")
+ * s2 = rsplit_re(s, p2, 1)
+ * s2 is a table of strings columns:
+ *     [ ["a_bc def", "a_", "_ab", "ab"],
+ *       ["g", "bc", "cd", "cd "] ]
+ * @endcode
+ *
+ * @throw cudf::logic_error if the pattern of `prog` is empty.
+ *
+ * @param input A column of string elements to be split.
+ * @param prog Regex program instance
+ * @param maxsplit Maximum number of splits to perform.
+ *        Default of -1 indicates all possible splits on each string.
+ * @param mr Device memory resource used to allocate the returned result's device memory.
+ * @return A table of columns of strings.
+ */
+std::unique_ptr<table> rsplit_re(
+  strings_column_view const& input,
+  regex_program const& prog,
+  size_type maxsplit = -1,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Splits strings elements into a list column of strings
  * using the given regex pattern to delimit each string.
@@ -179,6 +288,62 @@ std::unique_ptr<column> split_record_re(
   size_type maxsplit = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Splits strings elements into a list column of strings
+ * using the given regex_program to delimit each string
+ *
+ * Each element generates an array of strings that are stored in an output
+ * lists column -- `list[row] = [token1, token2, ...] found in input[row]`
+ * where `token` is a substring between delimiters.
+ *
+ * The number of elements in the output column will be the same as the number of
+ * elements in the input column. Each individual list item will contain the
+ * new strings for that row. The resulting number of strings in each row can vary
+ * from 0 to `maxsplit + 1`.
+ *
+ * The `pattern` is used to identify the delimiters within a string
+ * and splitting stops when either `maxsplit` or the end of the string is reached.
+ *
+ * An empty input string will produce a corresponding empty list item output row.
+ * A null row will produce a corresponding null output row.
+ *
+ * The regex_program's regex_flags are ignored.
+ *
+ * @code{.pseudo}
+ * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "]
+ * p1 = regex_program::create("[_ ]")
+ * s1 = split_record_re(s, p1)
+ * s1 is a lists column of strings:
+ *     [ ["a", "bc", "def", "g"],
+ *       ["a", "", "bc"],
+ *       ["", "ab", "cd"],
+ *       ["ab", "cd", ""] ]
+ * p2 = regex_program::create("[ _]")
+ * s2 = split_record_re(s, p2, 1)
+ * s2 is a lists column of strings:
+ *     [ ["a", "bc def_g"],
+ *       ["a", "_bc"],
+ *       ["", "ab cd"],
+ *       ["ab", "cd "] ]
+ * @endcode
+ *
+ * @throw cudf::logic_error if the pattern of `prog` is empty.
+ *
+ * See the @ref md_regex "Regex Features" page for details on patterns supported by this API.
+ *
+ * @param input A column of string elements to be split
+ * @param prog Regex program instance
+ * @param maxsplit Maximum number of splits to perform.
+ *        Default of -1 indicates all possible splits on each string.
+ * @param mr Device memory resource used to allocate the returned result's device memory
+ * @return Lists column of strings.
+ */
+std::unique_ptr<column> split_record_re(
+  strings_column_view const& input,
+  regex_program const& prog,
+  size_type maxsplit = -1,
+  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
+
 /**
  * @brief Splits strings elements into a list column of strings
  * using the given regex pattern to delimit each string starting from the end of the string.
@@ -233,6 +398,64 @@ std::unique_ptr<column> rsplit_record_re(
   size_type maxsplit = -1,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource());
 
+/**
+ * @brief Splits strings elements into a list column of strings using the given
+ * regex_program to delimit each string starting from the end of the string
+ *
+ * Each element generates a vector of strings that are stored in an output
+ * lists column -- `list[row] = [token1, token2, ...] found in input[row]`
+ * where `token` is a substring between delimiters.
+ *
+ * The number of elements in the output column will be the same as the number of
+ * elements in the input column. Each individual list item will contain the
+ * new strings for that row.
The resulting number of strings in each row can vary + * from 0 to `maxsplit + 1`. + * + * Splitting occurs by traversing starting from the end of the input string. + * The `pattern` is used to identify the separation points within a string + * and splitting stops when either `maxsplit` or the beginning of the string + * is reached. + * + * An empty input string will produce a corresponding empty list item output row. + * A null row will produce a corresponding null output row. + * + * The regex_program's regex_flags are ignored. + * + * @code{.pseudo} + * s = ["a_bc def_g", "a__bc", "_ab cd", "ab_cd "] + * p1 = regex_program::create("[_ ]") + * s1 = rsplit_record_re(s, p1) + * s1 is a lists column of strings: + * [ ["a", "bc", "def", "g"], + * ["a", "", "bc"], + * ["", "ab", "cd"], + * ["ab", "cd", ""] ] + * p2 = regex_program::create("[ _]") + * s2 = rsplit_record_re(s, p2, 1) + * s2 is a lists column of strings: + * [ ["a_bc def", "g"], + * ["a_", "bc"], + * ["_ab", "cd"], + * ["ab_cd", ""] ] + * @endcode + * + * See the @ref md_regex "Regex Features" page for details on patterns supported by this API. + * + * @throw cudf::logic_error if `pattern` is empty. + * + * @param input A column of string elements to be split + * @param prog Regex program instance + * @param maxsplit Maximum number of splits to perform. + * Default of -1 indicates all possible splits on each string. + * @param mr Device memory resource used to allocate the returned result's device memory + * @return Lists column of strings + */ +std::unique_ptr rsplit_record_re( + strings_column_view const& input, + regex_program const& prog, + size_type maxsplit = -1, + rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); + /** @} */ // end of doxygen group } // namespace strings } // namespace cudf diff --git a/cpp/include/cudf/strings/string_view.hpp b/cpp/include/cudf/strings/string_view.hpp index 03bf538b1b2..265adc60392 100644 --- a/cpp/include/cudf/strings/string_view.hpp +++ b/cpp/include/cudf/strings/string_view.hpp @@ -17,6 +17,8 @@ #include +#include + #include /** diff --git a/cpp/include/cudf/table/experimental/row_operators.cuh b/cpp/include/cudf/table/experimental/row_operators.cuh index af7091fc00c..e7b0c6eb6b6 100644 --- a/cpp/include/cudf/table/experimental/row_operators.cuh +++ b/cpp/include/cudf/table/experimental/row_operators.cuh @@ -820,7 +820,7 @@ class self_comparator { self_comparator(table_view const& t, host_span column_order = {}, host_span null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream = cudf::get_default_stream()) : d_t{preprocessed_table::create(t, column_order, null_precedence, stream)} { } @@ -962,7 +962,7 @@ class two_table_comparator { table_view const& right, host_span column_order = {}, host_span null_precedence = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value); + rmm::cuda_stream_view stream = cudf::get_default_stream()); /** * @brief Construct an owning object for performing a lexicographic comparison between two rows of diff --git a/cpp/include/cudf/table/table.hpp b/cpp/include/cudf/table/table.hpp index 3b803c2b949..6d11ed0bfad 100644 --- a/cpp/include/cudf/table/table.hpp +++ b/cpp/include/cudf/table/table.hpp @@ -69,7 +69,7 @@ class table { * @param mr Device memory resource used for allocating the device memory for the new columns */ table(table_view view, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), 
rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** diff --git a/cpp/include/cudf/table/table_device_view.cuh b/cpp/include/cudf/table/table_device_view.cuh index 9f6930b57f5..511013b585d 100644 --- a/cpp/include/cudf/table/table_device_view.cuh +++ b/cpp/include/cudf/table/table_device_view.cuh @@ -175,7 +175,7 @@ class table_device_view : public detail::table_device_view_basedestroy(); }; return std::unique_ptr{ @@ -212,7 +212,7 @@ class mutable_table_device_view * available in device memory */ static auto create(mutable_table_view source_view, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream = cudf::get_default_stream()) { auto deleter = [](mutable_table_device_view* t) { t->destroy(); }; return std::unique_ptr{ diff --git a/cpp/include/cudf/tdigest/tdigest_column_view.cuh b/cpp/include/cudf/tdigest/tdigest_column_view.hpp similarity index 83% rename from cpp/include/cudf/tdigest/tdigest_column_view.cuh rename to cpp/include/cudf/tdigest/tdigest_column_view.hpp index 64371fd5c45..c63e2b16326 100644 --- a/cpp/include/cudf/tdigest/tdigest_column_view.cuh +++ b/cpp/include/cudf/tdigest/tdigest_column_view.hpp @@ -16,30 +16,11 @@ #pragma once #include -#include #include namespace cudf { namespace tdigest { -/** - * @brief Functor to compute the size of each tdigest of a column. - * - */ -struct tdigest_size { - size_type const* offsets; ///< Offsets of the t-digest column - /** - * @brief Returns size of the each tdigest in the column - * - * @param tdigest_index Index of the tdigest in the column - * @return Size of the tdigest - */ - __device__ size_type operator()(size_type tdigest_index) - { - return offsets[tdigest_index + 1] - offsets[tdigest_index]; - } -}; - /** * @brief Given a column_view containing tdigest data, an instance of this class * provides a wrapper on the compound column for tdigest operations. @@ -127,18 +108,6 @@ class tdigest_column_view : private column_view { */ [[nodiscard]] column_view weights() const; - /** - * @brief Returns an iterator that returns the size of each tdigest - * in the column (each row is 1 digest) - * - * @return An iterator that returns the size of each tdigest in the column - */ - [[nodiscard]] auto size_begin() const - { - return cudf::detail::make_counting_transform_iterator( - 0, tdigest_size{centroids().offsets_begin()}); - } - /** * @brief Returns the first min value for the column. Each row corresponds * to the minimum value for the accompanying digest. diff --git a/cpp/include/cudf/utilities/default_stream.hpp b/cpp/include/cudf/utilities/default_stream.hpp index 94bc01787e3..1eec3b994d0 100644 --- a/cpp/include/cudf/utilities/default_stream.hpp +++ b/cpp/include/cudf/utilities/default_stream.hpp @@ -16,21 +16,19 @@ #pragma once +#include + +#include #include namespace cudf { /** - * @brief Default stream for cudf + * @brief Get the current default stream * - * Use this value to ensure the correct stream is used when compiled with per - * thread default stream. + * @return The current default stream. */ -#if defined(CUDF_USE_PER_THREAD_DEFAULT_STREAM) -static const rmm::cuda_stream_view default_stream_value{rmm::cuda_stream_per_thread}; -#else -static constexpr rmm::cuda_stream_view default_stream_value{}; -#endif +rmm::cuda_stream_view const get_default_stream(); /** * @brief Check if per-thread default stream is enabled. 
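As context for the default_stream_value to get_default_stream() migration in the diff above, a
minimal calling sketch (an illustration only, not part of this diff; rmm::device_buffer and
rmm::cuda_stream_view are standard RMM types):

  #include <cudf/utilities/default_stream.hpp>
  #include <rmm/device_buffer.hpp>

  void example()
  {
    // Fetch cudf's current default stream at runtime (honors per-thread default stream builds).
    rmm::cuda_stream_view stream = cudf::get_default_stream();
    rmm::device_buffer buf{1024, stream};  // allocate 1 KiB on that stream
    stream.synchronize();                  // wait for pending work on the stream
  }

Because the stream is now resolved by a function call rather than a compile-time constant, the
same libcudf binary can serve both default-stream and per-thread-default-stream callers.
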
diff --git a/cpp/include/cudf/utilities/span.hpp b/cpp/include/cudf/utilities/span.hpp
index dcb9786bbd2..074e8d25bf7 100644
--- a/cpp/include/cudf/utilities/span.hpp
+++ b/cpp/include/cudf/utilities/span.hpp
@@ -226,7 +226,7 @@ struct host_span : public cudf::detail::span_base
                       std::declval<C&>().data()))> (*)[], T (*)[]>>* = nullptr>
-  constexpr host_span(C& in) : base(in.data(), in.size())
+  constexpr host_span(C& in) : base(thrust::raw_pointer_cast(in.data()), in.size())
   {
   }
@@ -239,7 +239,7 @@ struct host_span : public cudf::detail::span_base
                       std::declval<C const&>().data()))> (*)[], T (*)[]>>* = nullptr>
-  constexpr host_span(C const& in) : base(in.data(), in.size())
+  constexpr host_span(C const& in) : base(thrust::raw_pointer_cast(in.data()), in.size())
   {
   }
diff --git a/cpp/include/cudf/utilities/traits.hpp b/cpp/include/cudf/utilities/traits.hpp
index d95ea42a039..43d43ba6bb3 100644
--- a/cpp/include/cudf/utilities/traits.hpp
+++ b/cpp/include/cudf/utilities/traits.hpp
@@ -261,6 +261,30 @@ constexpr inline bool is_signed_iterator()
   return std::is_signed_v<typename std::iterator_traits<Iterator>::value_type>;
 }
 
+/**
+ * @brief Indicates whether the type `T` is an integral type.
+ *
+ * @tparam T The type to verify
+ * @return true `T` is integral
+ * @return false `T` is not integral
+ */
+template <typename T>
+constexpr inline bool is_integral()
+{
+  return cuda::std::is_integral_v<T>;
+}
+
+/**
+ * @brief Indicates whether `type` is an integral `data_type`.
+ *
+ * "Integral" types are fundamental integer types such as `INT*` and `UINT*`.
+ *
+ * @param type The `data_type` to verify
+ * @return true `type` is integral
+ * @return false `type` is not integral
+ */
+bool is_integral(data_type type);
+
 /**
  * @brief Indicates whether the type `T` is a floating point type.
  *
diff --git a/cpp/include/cudf_test/base_fixture.hpp b/cpp/include/cudf_test/base_fixture.hpp
index e529785a758..be4d5bccd7b 100644
--- a/cpp/include/cudf_test/base_fixture.hpp
+++ b/cpp/include/cudf_test/base_fixture.hpp
@@ -18,12 +18,15 @@
 #include
+#include
 #include
 #include
 #include
 #include
 #include
+#include
+#include
 #include
 #include
 #include
@@ -303,11 +306,18 @@ inline auto parse_cudf_test_opts(int argc, char** argv)
   try {
     cxxopts::Options options(argv[0], " - cuDF tests command line options");
     const char* env_rmm_mode = std::getenv("GTEST_CUDF_RMM_MODE");  // Overridden by CLI options
+    const char* env_stream_mode =
+      std::getenv("GTEST_CUDF_STREAM_MODE");  // Overridden by CLI options
     auto default_rmm_mode = env_rmm_mode ? env_rmm_mode : "pool";
+    auto default_stream_mode = env_stream_mode ? env_stream_mode : "default";
     options.allow_unrecognised_options().add_options()(
       "rmm_mode",
       "RMM allocation mode",
       cxxopts::value<std::string>()->default_value(default_rmm_mode));
+    options.allow_unrecognised_options().add_options()(
+      "stream_mode",
+      "Whether to use a non-default stream",
+      cxxopts::value<std::string>()->default_value(default_stream_mode));
     return options.parse(argc, argv);
   } catch (const cxxopts::OptionException& e) {
     CUDF_FAIL("Error parsing command line options");
@@ -324,13 +334,21 @@
  * function parses the command line to customize test behavior, like the
  * allocation mode used for creating the default memory resource.
*/ -#define CUDF_TEST_PROGRAM_MAIN() \ - int main(int argc, char** argv) \ - { \ - ::testing::InitGoogleTest(&argc, argv); \ - auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ - auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ - auto resource = cudf::test::create_memory_resource(rmm_mode); \ - rmm::mr::set_current_device_resource(resource.get()); \ - return RUN_ALL_TESTS(); \ +#define CUDF_TEST_PROGRAM_MAIN() \ + int main(int argc, char** argv) \ + { \ + ::testing::InitGoogleTest(&argc, argv); \ + auto const cmd_opts = parse_cudf_test_opts(argc, argv); \ + auto const rmm_mode = cmd_opts["rmm_mode"].as(); \ + auto resource = cudf::test::create_memory_resource(rmm_mode); \ + rmm::mr::set_current_device_resource(resource.get()); \ + \ + auto const stream_mode = cmd_opts["stream_mode"].as(); \ + rmm::cuda_stream const new_default_stream{}; \ + if (stream_mode == "custom") { \ + auto adapter = make_stream_checking_resource_adaptor(resource.get()); \ + rmm::mr::set_current_device_resource(&adapter); \ + } \ + \ + return RUN_ALL_TESTS(); \ } diff --git a/cpp/include/cudf_test/column_utilities.hpp b/cpp/include/cudf_test/column_utilities.hpp index d41ea530402..2cc90743912 100644 --- a/cpp/include/cudf_test/column_utilities.hpp +++ b/cpp/include/cudf_test/column_utilities.hpp @@ -107,6 +107,13 @@ bool expect_columns_equivalent(cudf::column_view const& lhs, debug_output_level verbosity = debug_output_level::FIRST_ERROR, size_type fp_ulps = cudf::test::default_ulp); +/** + * @brief Verifies the given column is empty + * + * @param col The column to check + */ +void expect_column_empty(cudf::column_view const& col); + /** * @brief Verifies the bitwise equality of two device memory buffers. * @@ -234,11 +241,11 @@ inline std::pair, std::vector> to auto const scv = strings_column_view(c); auto const h_chars = cudf::detail::make_std_vector_sync( cudf::device_span(scv.chars().data(), scv.chars().size()), - cudf::default_stream_value); + cudf::get_default_stream()); auto const h_offsets = cudf::detail::make_std_vector_sync( cudf::device_span( scv.offsets().data() + scv.offset(), scv.size() + 1), - cudf::default_stream_value); + cudf::get_default_stream()); // build std::string vector from chars and offsets std::vector host_data; diff --git a/cpp/include/cudf_test/column_wrapper.hpp b/cpp/include/cudf_test/column_wrapper.hpp index 8827372b3fd..91773b2c3f1 100644 --- a/cpp/include/cudf_test/column_wrapper.hpp +++ b/cpp/include/cudf_test/column_wrapper.hpp @@ -170,7 +170,7 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end) auto transform_begin = thrust::make_transform_iterator(begin, transformer); auto const size = cudf::distance(begin, end); auto const elements = thrust::host_vector(transform_begin, transform_begin + size); - return rmm::device_buffer{elements.data(), size * sizeof(ElementTo), cudf::default_stream_value}; + return rmm::device_buffer{elements.data(), size * sizeof(ElementTo), cudf::get_default_stream()}; } /** @@ -196,7 +196,7 @@ rmm::device_buffer make_elements(InputIterator begin, InputIterator end) auto transform_begin = thrust::make_transform_iterator(begin, transformer); auto const size = cudf::distance(begin, end); auto const elements = thrust::host_vector(transform_begin, transform_begin + size); - return rmm::device_buffer{elements.data(), size * sizeof(RepType), cudf::default_stream_value}; + return rmm::device_buffer{elements.data(), size * sizeof(RepType), cudf::get_default_stream()}; } /** @@ -223,7 +223,7 @@ rmm::device_buffer 
make_elements(InputIterator begin, InputIterator end) auto transformer_begin = thrust::make_transform_iterator(begin, to_rep); auto const size = cudf::distance(begin, end); auto const elements = thrust::host_vector(transformer_begin, transformer_begin + size); - return rmm::device_buffer{elements.data(), size * sizeof(RepType), cudf::default_stream_value}; + return rmm::device_buffer{elements.data(), size * sizeof(RepType), cudf::get_default_stream()}; } /** @@ -271,7 +271,7 @@ rmm::device_buffer make_null_mask(ValidityIterator begin, ValidityIterator end) auto null_mask = make_null_mask_vector(begin, end); return rmm::device_buffer{null_mask.data(), null_mask.size() * sizeof(decltype(null_mask.front())), - cudf::default_stream_value}; + cudf::get_default_stream()}; } /** @@ -547,7 +547,7 @@ class fixed_point_column_wrapper : public detail::column_wrapper { wrapped.reset(new cudf::column{ data_type, size, - rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::default_stream_value}}); + rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::get_default_stream()}}); } /** @@ -611,7 +611,7 @@ class fixed_point_column_wrapper : public detail::column_wrapper { wrapped.reset(new cudf::column{ data_type, size, - rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::default_stream_value}, + rmm::device_buffer{elements.data(), size * sizeof(Rep), cudf::get_default_stream()}, detail::make_null_mask(v, v + size), cudf::UNKNOWN_NULL_COUNT}); } @@ -732,9 +732,9 @@ class strings_column_wrapper : public detail::column_wrapper { { auto all_valid = thrust::make_constant_iterator(true); auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, all_valid); - auto d_chars = cudf::detail::make_device_uvector_sync(chars); - auto d_offsets = cudf::detail::make_device_uvector_sync(offsets); - wrapped = cudf::make_strings_column(d_chars, d_offsets); + auto d_chars = cudf::detail::make_device_uvector_sync(chars, cudf::get_default_stream()); + auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, cudf::get_default_stream()); + wrapped = cudf::make_strings_column(d_chars, d_offsets); } /** @@ -772,10 +772,10 @@ class strings_column_wrapper : public detail::column_wrapper { size_type num_strings = std::distance(begin, end); auto [chars, offsets] = detail::make_chars_and_offsets(begin, end, v); auto null_mask = detail::make_null_mask_vector(v, v + num_strings); - auto d_chars = cudf::detail::make_device_uvector_sync(chars); - auto d_offsets = cudf::detail::make_device_uvector_sync(offsets); - auto d_bitmask = cudf::detail::make_device_uvector_sync(null_mask); - wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask); + auto d_chars = cudf::detail::make_device_uvector_sync(chars, cudf::get_default_stream()); + auto d_offsets = cudf::detail::make_device_uvector_sync(offsets, cudf::get_default_stream()); + auto d_bitmask = cudf::detail::make_device_uvector_sync(null_mask, cudf::get_default_stream()); + wrapped = cudf::make_strings_column(d_chars, d_offsets, d_bitmask); } /** diff --git a/cpp/include/cudf_test/cudf_gtest.hpp b/cpp/include/cudf_test/cudf_gtest.hpp index fb2680545d3..ab45d90f2d2 100644 --- a/cpp/include/cudf_test/cudf_gtest.hpp +++ b/cpp/include/cudf_test/cudf_gtest.hpp @@ -110,58 +110,6 @@ struct TypeList> { */ #define EXPECT_CUDA_SUCCEEDED(expr) EXPECT_EQ(cudaSuccess, expr) -/** - * @brief Utility for testing the expectation that an expression x throws the specified - * exception whose what() message ends with the msg - * - * @param x The expression to 
test
- * @param exception The exception type to test for
- * @param startswith The start of the expected message
- * @param endswith The end of the expected message
- */
-#define EXPECT_THROW_MESSAGE(x, exception, startswith, endswith)    \
-  do {                                                              \
-    EXPECT_THROW(                                                   \
-      {                                                             \
-        try {                                                       \
-          x;                                                        \
-        } catch (const exception& e) {                              \
-          ASSERT_NE(nullptr, e.what());                             \
-          EXPECT_THAT(e.what(), testing::StartsWith((startswith))); \
-          EXPECT_THAT(e.what(), testing::EndsWith((endswith)));     \
-          throw;                                                    \
-        }                                                           \
-      },                                                            \
-      exception);                                                   \
-  } while (0)
-
-/**
- * @brief test macro to be expected to throw cudf::logic_error with a message
- *
- * @param x The statement to be tested
- * @param msg The message associated with the exception
- */
-#define CUDF_EXPECT_THROW_MESSAGE(x, msg) \
-  EXPECT_THROW_MESSAGE(x, cudf::logic_error, "cuDF failure at:", msg)
-
-/**
- * @brief test macro to be expected to throw cudf::cuda_error with a message
- *
- * @param x The statement to be tested
- * @param msg The message associated with the exception
- */
-#define CUDA_EXPECT_THROW_MESSAGE(x, msg) \
-  EXPECT_THROW_MESSAGE(x, cudf::cuda_error, "CUDA error encountered at:", msg)
-
-/**
- * @brief test macro to be expected to throw cudf::fatal_logic_error with a message
- *
- * @param x The statement to be tested
- * @param msg The message associated with the exception
- */
-#define FATAL_CUDA_EXPECT_THROW_MESSAGE(x, msg) \
-  EXPECT_THROW_MESSAGE(x, cudf::fatal_cuda_error, "Fatal CUDA error encountered at:", msg)
-
 /**
  * @brief test macro to be expected as no exception.
  *
diff --git a/cpp/include/cudf_test/detail/column_utilities.hpp b/cpp/include/cudf_test/detail/column_utilities.hpp
index ddf3b658a86..f8270f61f10 100644
--- a/cpp/include/cudf_test/detail/column_utilities.hpp
+++ b/cpp/include/cudf_test/detail/column_utilities.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -82,4 +82,4 @@ std::vector<std::string> to_strings(cudf::column_view const& col, std::string co
 }  // namespace detail
 }  // namespace test
-}  // namespace cudf
\ No newline at end of file
+}  // namespace cudf
diff --git a/cpp/include/cudf_test/stream_checking_resource_adapter.hpp b/cpp/include/cudf_test/stream_checking_resource_adapter.hpp
new file mode 100644
index 00000000000..4a22ff148ae
--- /dev/null
+++ b/cpp/include/cudf_test/stream_checking_resource_adapter.hpp
@@ -0,0 +1,166 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+/**
+ * @brief Resource that verifies that the default stream is not used in any allocation.
+ *
+ * @tparam Upstream Type of the upstream resource used for
+ * allocation/deallocation.
+ */
+template <typename Upstream>
+class stream_checking_resource_adaptor final : public rmm::mr::device_memory_resource {
+ public:
+  /**
+   * @brief Construct a new adaptor.
+   *
+   * @throws `cudf::logic_error` if `upstream == nullptr`
+   *
+   * @param upstream The resource used for allocating/deallocating device memory
+   */
+  stream_checking_resource_adaptor(Upstream* upstream) : upstream_{upstream}
+  {
+    CUDF_EXPECTS(nullptr != upstream, "Unexpected null upstream resource pointer.");
+  }
+
+  stream_checking_resource_adaptor()                                          = delete;
+  ~stream_checking_resource_adaptor() override                                = default;
+  stream_checking_resource_adaptor(stream_checking_resource_adaptor const&)   = delete;
+  stream_checking_resource_adaptor& operator=(stream_checking_resource_adaptor const&) = delete;
+  stream_checking_resource_adaptor(stream_checking_resource_adaptor&&) noexcept        = default;
+  stream_checking_resource_adaptor& operator=(stream_checking_resource_adaptor&&) noexcept =
+    default;
+
+  /**
+   * @brief Return pointer to the upstream resource.
+   *
+   * @return Pointer to the upstream resource.
+   */
+  Upstream* get_upstream() const noexcept { return upstream_; }
+
+  /**
+   * @brief Checks whether the upstream resource supports streams.
+   *
+   * @return Whether or not the upstream resource supports streams
+   */
+  bool supports_streams() const noexcept override { return upstream_->supports_streams(); }
+
+  /**
+   * @brief Query whether the resource supports the get_mem_info API.
+   *
+   * @return Whether or not the upstream resource supports get_mem_info
+   */
+  bool supports_get_mem_info() const noexcept override
+  {
+    return upstream_->supports_get_mem_info();
+  }
+
+ private:
+  /**
+   * @brief Allocates memory of size at least `bytes` using the upstream
+   * resource as long as it fits inside the allocation limit.
+   *
+   * The returned pointer has at least 256B alignment.
+   *
+   * @throws `rmm::bad_alloc` if the requested allocation could not be fulfilled
+   * by the upstream resource.
+   * @throws `cudf::logic_error` if attempted on a default stream
+   *
+   * @param bytes The size, in bytes, of the allocation
+   * @param stream Stream on which to perform the allocation
+   * @return Pointer to the newly allocated memory
+   */
+  void* do_allocate(std::size_t bytes, rmm::cuda_stream_view stream) override
+  {
+    verify_non_default_stream(stream);
+    return upstream_->allocate(bytes, stream);
+  }
+
+  /**
+   * @brief Free allocation of size `bytes` pointed to by `ptr`
+   *
+   * @throws `cudf::logic_error` if attempted on a default stream
+   *
+   * @param ptr Pointer to be deallocated
+   * @param bytes Size of the allocation
+   * @param stream Stream on which to perform the deallocation
+   */
+  void do_deallocate(void* ptr, std::size_t bytes, rmm::cuda_stream_view stream) override
+  {
+    verify_non_default_stream(stream);
+    upstream_->deallocate(ptr, bytes, stream);
+  }
+
+  /**
+   * @brief Compare the upstream resource to another.
+   *
+   * @param other The other resource to compare to
+   * @return Whether or not the two resources are equivalent
+   */
+  bool do_is_equal(device_memory_resource const& other) const noexcept override
+  {
+    if (this == &other) { return true; }
+    auto cast = dynamic_cast<stream_checking_resource_adaptor<Upstream> const*>(&other);
+    return cast != nullptr ? upstream_->is_equal(*cast->get_upstream())
+                           : upstream_->is_equal(other);
+  }
+
+  /**
+   * @brief Get free and available memory from upstream resource.
+   *
+   * @throws `rmm::cuda_error` if unable to retrieve memory info.
+   * @throws `cudf::logic_error` if attempted on a default stream
+   *
+   * @param stream Stream on which to get the mem info.
+   * @return std::pair holding the free and total memory for the resource
+   */
+  std::pair<std::size_t, std::size_t> do_get_mem_info(rmm::cuda_stream_view stream) const override
+  {
+    verify_non_default_stream(stream);
+    return upstream_->get_mem_info(stream);
+  }
+
+  /**
+   * @brief Throw an error if given one of CUDA's default stream specifiers.
+   *
+   * @throws `std::runtime_error` if provided a default stream
+   */
+  void verify_non_default_stream(rmm::cuda_stream_view const stream) const
+  {
+    auto cstream{stream.value()};
+    if (cstream == cudaStreamDefault || (cstream == cudaStreamLegacy) ||
+        (cstream == cudaStreamPerThread)) {
+      throw std::runtime_error("Attempted to perform an operation on a default stream!");
+    }
+  }
+
+  Upstream* upstream_;  // the upstream resource used for satisfying allocation requests
+};
+
+/**
+ * @brief Convenience factory to return a `stream_checking_resource_adaptor` around the
+ * upstream resource `upstream`.
+ *
+ * @tparam Upstream Type of the upstream `device_memory_resource`.
+ * @param upstream Pointer to the upstream resource
+ */
+template <typename Upstream>
+stream_checking_resource_adaptor<Upstream> make_stream_checking_resource_adaptor(
+  Upstream* upstream)
+{
+  return stream_checking_resource_adaptor<Upstream>{upstream};
+}
diff --git a/cpp/include/cudf_test/tdigest_utilities.cuh b/cpp/include/cudf_test/tdigest_utilities.cuh
index 6f206a789fd..ce45ad91be1 100644
--- a/cpp/include/cudf_test/tdigest_utilities.cuh
+++ b/cpp/include/cudf_test/tdigest_utilities.cuh
@@ -16,16 +16,14 @@
 #pragma once
 
+#include
+
 #include
 #include
 #include
-#include
+#include
 #include
-#include
-
-#include
-
 #include
 #include
 #include
@@ -102,6 +100,58 @@ struct tdigest_gen {
   // @endcond
 };
 
+template <typename T>
+inline T frand()
+{
+  return static_cast<T>(rand()) / static_cast<T>(RAND_MAX);
+}
+
+template <typename T>
+inline T rand_range(T min, T max)
+{
+  return min + static_cast<T>(frand<T>() * (max - min));
+}
+
+inline std::unique_ptr<column> generate_typed_percentile_distribution(
+  std::vector<double> const& buckets,
+  std::vector<int> const& sizes,
+  data_type t,
+  bool sorted = false)
+{
+  srand(0);
+
+  std::vector<double> values;
+  size_t total_size = std::reduce(sizes.begin(), sizes.end(), 0);
+  values.reserve(total_size);
+  for (size_t idx = 0; idx < sizes.size(); idx++) {
+    double min = idx == 0 ? 0.0f : buckets[idx - 1];
+    double max = buckets[idx];
+
+    for (int v_idx = 0; v_idx < sizes[idx]; v_idx++) {
+      values.push_back(rand_range(min, max));
+    }
+  }
+
+  if (sorted) { std::sort(values.begin(), values.end()); }
+
+  cudf::test::fixed_width_column_wrapper<double> src(values.begin(), values.end());
+  return cudf::cast(src, t);
+}
+
+// "standardized" means the parameters sent into generate_typed_percentile_distribution. the intent
+// is to provide a standardized set of inputs for use with tdigest generation tests and
+// percentile_approx tests. std::vector<double>
+// buckets{10.0, 20.0, 30.0, 40.0, 50.0, 60.0, 70.0, 80.0, 90.0, 100.0}; std::vector<int>
+// sizes{50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000};
+inline std::unique_ptr<column> generate_standardized_percentile_distribution(
+  data_type t = data_type{type_id::FLOAT64}, bool sorted = false)
+{
+  std::vector<double> buckets{10.0f, 20.0f, 30.0f, 40.0f, 50.0f, 60.0f, 70.0f, 80.0, 90.0f, 100.0f};
+  std::vector<int> b_sizes{
+    50000, 50000, 50000, 50000, 50000, 100000, 100000, 100000, 100000, 100000};
+  return generate_typed_percentile_distribution(buckets, b_sizes, t, sorted);
+}
+
 /**
  * @brief Compare a tdigest column against a sampling of expected values.
*/ @@ -118,11 +168,11 @@ void tdigest_minmax_compare(cudf::tdigest::tdigest_column_view const& tdv, // verify min/max thrust::host_vector> h_spans; h_spans.push_back({input_values.begin(), static_cast(input_values.size())}); - thrust::device_vector> spans(h_spans); + auto spans = cudf::detail::make_device_uvector_async(h_spans, cudf::get_default_stream()); auto expected_min = cudf::make_fixed_width_column( data_type{type_id::FLOAT64}, spans.size(), mask_state::UNALLOCATED); - thrust::transform(rmm::exec_policy(cudf::default_stream_value), + thrust::transform(rmm::exec_policy(cudf::get_default_stream()), spans.begin(), spans.end(), expected_min->mutable_view().template begin(), @@ -132,7 +182,7 @@ void tdigest_minmax_compare(cudf::tdigest::tdigest_column_view const& tdv, auto expected_max = cudf::make_fixed_width_column( data_type{type_id::FLOAT64}, spans.size(), mask_state::UNALLOCATED); - thrust::transform(rmm::exec_policy(cudf::default_stream_value), + thrust::transform(rmm::exec_policy(cudf::get_default_stream()), spans.begin(), spans.end(), expected_max->mutable_view().template begin(), @@ -217,7 +267,7 @@ void tdigest_simple_all_nulls_aggregation(Func op) static_cast(values).type(), tdigest_gen{}, op, values, delta); // NOTE: an empty tdigest column still has 1 row. - auto expected = cudf::detail::tdigest::make_empty_tdigest_column(); + auto expected = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*result, *expected); } @@ -508,9 +558,9 @@ template void tdigest_merge_empty(MergeFunc merge_op) { // 3 empty tdigests all in the same group - auto a = cudf::detail::tdigest::make_empty_tdigest_column(); - auto b = cudf::detail::tdigest::make_empty_tdigest_column(); - auto c = cudf::detail::tdigest::make_empty_tdigest_column(); + auto a = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); + auto b = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); + auto c = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); std::vector cols; cols.push_back(*a); cols.push_back(*b); @@ -520,7 +570,7 @@ void tdigest_merge_empty(MergeFunc merge_op) auto const delta = 1000; auto result = merge_op(*values, delta); - auto expected = cudf::detail::tdigest::make_empty_tdigest_column(); + auto expected = cudf::tdigest::detail::make_empty_tdigest_column(cudf::get_default_stream()); CUDF_TEST_EXPECT_COLUMNS_EQUAL(*expected, *result); } diff --git a/cpp/include/doxygen_groups.h b/cpp/include/doxygen_groups.h index c0ea06959b2..5c335b720d5 100644 --- a/cpp/include/doxygen_groups.h +++ b/cpp/include/doxygen_groups.h @@ -129,6 +129,7 @@ * @defgroup strings_replace Replacing * @defgroup strings_split Splitting * @defgroup strings_json JSON + * @defgroup strings_regex Regex * @} * @defgroup dictionary_apis Dictionary * @{ diff --git a/cpp/include/nvtext/bpe_tokenize.hpp b/cpp/include/nvtext/bpe_tokenize.hpp index 97e354cb39b..b93d93b07c6 100644 --- a/cpp/include/nvtext/bpe_tokenize.hpp +++ b/cpp/include/nvtext/bpe_tokenize.hpp @@ -46,7 +46,7 @@ struct bpe_merge_pairs { * @param mr Device memory resource used to allocate the device memory */ bpe_merge_pairs(std::unique_ptr&& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -57,7 +57,7 @@ struct bpe_merge_pairs { * @param mr Device memory resource used to allocate the 
device memory */ bpe_merge_pairs(cudf::strings_column_view const& input, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); ~bpe_merge_pairs(); diff --git a/cpp/include/nvtext/detail/tokenize.hpp b/cpp/include/nvtext/detail/tokenize.hpp index 2b5d0bb855e..38b49e63590 100644 --- a/cpp/include/nvtext/detail/tokenize.hpp +++ b/cpp/include/nvtext/detail/tokenize.hpp @@ -35,11 +35,10 @@ namespace detail { * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings columns of tokens. */ -std::unique_ptr tokenize( - cudf::strings_column_view const& strings, - cudf::string_scalar const& delimiter = cudf::string_scalar{""}, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr tokenize(cudf::strings_column_view const& strings, + cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc nvtext::tokenize(strings_column_view const&,strings_column_view @@ -51,11 +50,10 @@ std::unique_ptr tokenize( * @param mr Device memory resource used to allocate the returned column's device memory. * @return New strings columns of tokens. */ -std::unique_ptr tokenize( - cudf::strings_column_view const& strings, - cudf::strings_column_view const& delimiters, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr tokenize(cudf::strings_column_view const& strings, + cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc nvtext::count_tokens(strings_column_view const&, string_scalar @@ -68,11 +66,10 @@ std::unique_ptr tokenize( * @param mr Device memory resource used to allocate the returned column's device memory. * @return New INT32 column of token counts. */ -std::unique_ptr count_tokens( - cudf::strings_column_view const& strings, - cudf::string_scalar const& delimiter = cudf::string_scalar{""}, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr count_tokens(cudf::strings_column_view const& strings, + cudf::string_scalar const& delimiter, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); /** * @copydoc nvtext::count_tokens(strings_column_view const&,strings_column_view @@ -84,11 +81,10 @@ std::unique_ptr count_tokens( * @param mr Device memory resource used to allocate the returned column's device memory. * @return New INT32 column of token counts. 
*/ -std::unique_ptr count_tokens( - cudf::strings_column_view const& strings, - cudf::strings_column_view const& delimiters, - rmm::cuda_stream_view stream = cudf::default_stream_value, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); +std::unique_ptr count_tokens(cudf::strings_column_view const& strings, + cudf::strings_column_view const& delimiters, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); } // namespace detail } // namespace nvtext diff --git a/cpp/libcudf_kafka/CMakeLists.txt b/cpp/libcudf_kafka/CMakeLists.txt index 7926c8ad9df..da3b6b8af62 100644 --- a/cpp/libcudf_kafka/CMakeLists.txt +++ b/cpp/libcudf_kafka/CMakeLists.txt @@ -22,7 +22,7 @@ include(rapids-find) project( CUDA_KAFKA - VERSION 22.10.01 + VERSION 22.12.00 LANGUAGES CXX ) diff --git a/cpp/scripts/run-cmake-format.sh b/cpp/scripts/run-cmake-format.sh index b9157c76492..f3e21779aa5 100755 --- a/cpp/scripts/run-cmake-format.sh +++ b/cpp/scripts/run-cmake-format.sh @@ -17,7 +17,7 @@ # and exits gracefully if the file is not found. If a user wishes to specify a # config file at a nonstandard location, they may do so by setting the # environment variable RAPIDS_CMAKE_FORMAT_FILE. -# +# # This script can be invoked directly anywhere within the project repository. # Alternatively, it may be invoked as a pre-commit hook via # `pre-commit run (cmake-format)|(cmake-lint)`. diff --git a/cpp/src/binaryop/binaryop.cpp b/cpp/src/binaryop/binaryop.cpp index 4b79cc0581a..83ad8aa4cee 100644 --- a/cpp/src/binaryop/binaryop.cpp +++ b/cpp/src/binaryop/binaryop.cpp @@ -406,7 +406,7 @@ std::unique_ptr binary_operation(scalar const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::default_stream_value, mr); + return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); } std::unique_ptr binary_operation(column_view const& lhs, scalar const& rhs, @@ -415,7 +415,7 @@ std::unique_ptr binary_operation(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::default_stream_value, mr); + return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); } std::unique_ptr binary_operation(column_view const& lhs, column_view const& rhs, @@ -424,7 +424,7 @@ std::unique_ptr binary_operation(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, op, output_type, cudf::default_stream_value, mr); + return detail::binary_operation(lhs, rhs, op, output_type, cudf::get_default_stream(), mr); } std::unique_ptr binary_operation(column_view const& lhs, @@ -434,7 +434,7 @@ std::unique_ptr binary_operation(column_view const& lhs, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::binary_operation(lhs, rhs, ptx, output_type, cudf::default_stream_value, mr); + return detail::binary_operation(lhs, rhs, ptx, output_type, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/binaryop/compiled/binary_ops.hpp b/cpp/src/binaryop/compiled/binary_ops.hpp index 1f711b7c899..c51993409ef 100644 --- a/cpp/src/binaryop/compiled/binary_ops.hpp +++ b/cpp/src/binaryop/compiled/binary_ops.hpp @@ -37,7 +37,7 @@ std::unique_ptr string_null_min_max( column_view const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + 
rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr string_null_min_max( @@ -45,7 +45,7 @@ std::unique_ptr string_null_min_max( scalar const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); std::unique_ptr string_null_min_max( @@ -53,7 +53,7 @@ std::unique_ptr string_null_min_max( column_view const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -80,7 +80,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -107,7 +107,7 @@ std::unique_ptr binary_operation( scalar const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -133,7 +133,7 @@ std::unique_ptr binary_operation( column_view const& rhs, binary_operator op, data_type output_type, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); void binary_operation(mutable_column_view& out, diff --git a/cpp/src/binaryop/compiled/struct_binary_ops.cuh b/cpp/src/binaryop/compiled/struct_binary_ops.cuh index def9ebcef97..2fcf1ce4e32 100644 --- a/cpp/src/binaryop/compiled/struct_binary_ops.cuh +++ b/cpp/src/binaryop/compiled/struct_binary_ops.cuh @@ -71,7 +71,7 @@ void apply_struct_binary_op(mutable_column_view& out, bool is_lhs_scalar, bool is_rhs_scalar, PhysicalElementComparator comparator = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream = cudf::get_default_stream()) { auto const compare_orders = std::vector( lhs.size(), @@ -115,7 +115,7 @@ void apply_struct_equality_op(mutable_column_view& out, bool is_rhs_scalar, binary_operator op, PhysicalEqualityComparator comparator = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream = cudf::get_default_stream()) { CUDF_EXPECTS(op == binary_operator::EQUAL || op == binary_operator::NOT_EQUAL || op == binary_operator::NULL_EQUALS, diff --git a/cpp/src/bitmask/null_mask.cu b/cpp/src/bitmask/null_mask.cu index 4c9151533c2..958bf21e6df 100644 --- a/cpp/src/bitmask/null_mask.cu +++ b/cpp/src/bitmask/null_mask.cu @@ -158,14 +158,14 @@ rmm::device_buffer create_null_mask(size_type size, mask_state state, rmm::mr::device_memory_resource* mr) { - return detail::create_null_mask(size, state, cudf::default_stream_value, mr); + return detail::create_null_mask(size, state, cudf::get_default_stream(), mr); } // Set pre-allocated null mask of given bit range [begin_bit, end_bit) to valid, if valid==true, // or null, otherwise; void set_null_mask(bitmask_type* bitmask, size_type begin_bit, size_type end_bit, bool valid) { - 
return detail::set_null_mask(bitmask, begin_bit, end_bit, valid);
+  return detail::set_null_mask(bitmask, begin_bit, end_bit, valid, cudf::get_default_stream());
 }

 namespace detail {
@@ -510,25 +510,25 @@ rmm::device_buffer copy_bitmask(bitmask_type const* mask,
                                 size_type end_bit,
                                 rmm::mr::device_memory_resource* mr)
 {
-  return detail::copy_bitmask(mask, begin_bit, end_bit, cudf::default_stream_value, mr);
+  return detail::copy_bitmask(mask, begin_bit, end_bit, cudf::get_default_stream(), mr);
 }

 // Create a bitmask from a column view
 rmm::device_buffer copy_bitmask(column_view const& view, rmm::mr::device_memory_resource* mr)
 {
-  return detail::copy_bitmask(view, cudf::default_stream_value, mr);
+  return detail::copy_bitmask(view, cudf::get_default_stream(), mr);
 }

 std::pair<rmm::device_buffer, size_type> bitmask_and(table_view const& view,
                                                      rmm::mr::device_memory_resource* mr)
 {
-  return detail::bitmask_and(view, cudf::default_stream_value, mr);
+  return detail::bitmask_and(view, cudf::get_default_stream(), mr);
 }

 std::pair<rmm::device_buffer, size_type> bitmask_or(table_view const& view,
                                                     rmm::mr::device_memory_resource* mr)
 {
-  return detail::bitmask_or(view, cudf::default_stream_value, mr);
+  return detail::bitmask_or(view, cudf::get_default_stream(), mr);
 }

 }  // namespace cudf
diff --git a/cpp/src/column/column.cu b/cpp/src/column/column.cu
index 61dfea6c26a..7b862373a5b 100644
--- a/cpp/src/column/column.cu
+++ b/cpp/src/column/column.cu
@@ -144,7 +144,7 @@ size_type column::null_count() const
   CUDF_FUNC_RANGE();
   if (_null_count <= cudf::UNKNOWN_NULL_COUNT) {
     _null_count = cudf::detail::null_count(
-      static_cast<bitmask_type const*>(_null_mask.data()), 0, size(), cudf::default_stream_value);
+      static_cast<bitmask_type const*>(_null_mask.data()), 0, size(), cudf::get_default_stream());
   }
   return _null_count;
 }
@@ -182,7 +182,7 @@ void column::set_null_count(size_type new_null_count)
 namespace {
 struct create_column_from_view {
   cudf::column_view view;
-  rmm::cuda_stream_view stream{cudf::default_stream_value};
+  rmm::cuda_stream_view stream{cudf::get_default_stream()};
   rmm::mr::device_memory_resource* mr;

   template <...>
...
@@ std::unique_ptr<column> make_numeric_column(data_type type,
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(is_numeric(type), "Invalid, non-numeric type.");
+  CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");

   return std::make_unique<column>(type,
                                   size,
@@ -97,6 +98,7 @@ std::unique_ptr<column> make_fixed_point_column(data_type type,
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(is_fixed_point(type), "Invalid, non-fixed_point type.");
+  CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");

   return std::make_unique<column>(type,
                                   size,
@@ -115,6 +117,7 @@ std::unique_ptr<column> make_timestamp_column(data_type type,
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(is_timestamp(type), "Invalid, non-timestamp type.");
+  CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");

   return std::make_unique<column>(type,
                                   size,
@@ -133,6 +136,7 @@ std::unique_ptr<column> make_duration_column(data_type type,
 {
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(is_duration(type), "Invalid, non-duration type.");
+  CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");

   return std::make_unique<column>(type,
                                   size,
@@ -166,6 +170,7 @@ std::unique_ptr<column> make_dictionary_from_scalar(scalar const& s,
                                                     rmm::mr::device_memory_resource* mr)
 {
   if (size == 0) return make_empty_column(type_id::DICTIONARY32);
+  CUDF_EXPECTS(size >= 0, "Column size cannot be negative.");
   CUDF_EXPECTS(s.is_valid(stream), "cannot create a dictionary with a null key");
   return make_dictionary_column(
     make_column_from_scalar(s, 1, stream, mr),
diff --git a/cpp/src/column/column_factories.cu b/cpp/src/column/column_factories.cu
index 90252fd6cf1..c401b765f0b 100644
--- a/cpp/src/column/column_factories.cu
+++ b/cpp/src/column/column_factories.cu
@@ -54,21 +54,15 @@ std::unique_ptr<column> column_from_scalar_dispatch::operator()<cudf::string_view>(
...
-      value.type(), size, rmm::device_buffer{}, std::move(null_mask), size);
-
-  // Create a strings column_view with all nulls and no children.
   // Since we are setting every row to the scalar, the fill() never needs to access
   // any of the children in the strings column which would otherwise cause an exception.
-  column_view sc{
-    data_type{type_id::STRING}, size, nullptr, static_cast<bitmask_type*>(null_mask.data()), size};
+  column_view sc{value.type(), size, nullptr};
   auto& sv = static_cast<scalar_type_t<cudf::string_view> const&>(value);
+  // fill the column with the scalar
   auto output = strings::detail::fill(strings_column_view(sc), 0, size, sv, stream, mr);
-  output->set_null_mask(rmm::device_buffer{}, 0);  // should be no nulls
+
   return output;
 }
diff --git a/cpp/src/column/column_view.cpp b/cpp/src/column/column_view.cpp
index 2ff088a3f20..3e18b9734f6 100644
--- a/cpp/src/column/column_view.cpp
+++ b/cpp/src/column/column_view.cpp
@@ -68,7 +68,7 @@ size_type column_view_base::null_count() const
 {
   if (_null_count <= cudf::UNKNOWN_NULL_COUNT) {
     _null_count = cudf::detail::null_count(
-      null_mask(), offset(), offset() + size(), cudf::default_stream_value);
+      null_mask(), offset(), offset() + size(), cudf::get_default_stream());
   }
   return _null_count;
 }
@@ -79,7 +79,7 @@ size_type column_view_base::null_count(size_type begin, size_type end) const
   return (null_count() == 0)
            ? 0
            : cudf::detail::null_count(
-               null_mask(), offset() + begin, offset() + end, cudf::default_stream_value);
+               null_mask(), offset() + begin, offset() + end, cudf::get_default_stream());
 }

 // Struct to use custom hash combine and fold expression
diff --git a/cpp/src/copying/concatenate.cu b/cpp/src/copying/concatenate.cu
index b770eef1c3a..577d6427b19 100644
--- a/cpp/src/copying/concatenate.cu
+++ b/cpp/src/copying/concatenate.cu
@@ -180,10 +180,8 @@ __global__ void fused_concatenate_kernel(column_device_view const* input_views,
   if (Nullable) { active_mask = __ballot_sync(0xFFFF'FFFFu, output_index < output_size); }
   while (output_index < output_size) {
     // Lookup input index by searching for output index in offsets
-    // thrust::prev isn't in CUDA 10.0, so subtracting 1 here instead
-    auto const offset_it =
-      -1 + thrust::upper_bound(
-             thrust::seq, input_offsets, input_offsets + num_input_views, output_index);
+    auto const offset_it = thrust::prev(thrust::upper_bound(
+      thrust::seq, input_offsets, input_offsets + num_input_views, output_index));
     size_type const partition_index = offset_it - input_offsets;

     // Copy input data to output
@@ -557,7 +555,7 @@ rmm::device_buffer concatenate_masks(host_span<column_view const> views,
                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::concatenate_masks(views, cudf::default_stream_value, mr);
+  return detail::concatenate_masks(views, cudf::get_default_stream(), mr);
 }

 // Concatenates the elements from a vector of column_views
@@ -565,14 +563,14 @@ std::unique_ptr<column> concatenate(host_span<column_view const> columns_to_concat,
                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::concatenate(columns_to_concat, cudf::default_stream_value, mr);
+  return detail::concatenate(columns_to_concat, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<table> concatenate(host_span<table_view const> tables_to_concat,
                                    rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::concatenate(tables_to_concat, cudf::default_stream_value, mr);
+  return detail::concatenate(tables_to_concat, cudf::get_default_stream(), mr);
 }

 }  // namespace cudf
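The hunks above and below all apply the same mechanical migration. A minimal sketch of the pattern (do_thing is a hypothetical API used only for illustration, not part of cudf): the public entry point stops naming the global cudf::default_stream_value object and instead asks for the default through cudf::get_default_stream(), so builds can substitute, for example, a per-thread default stream behind the getter.

// Sketch only; do_thing is hypothetical. Illustrates the public/detail split
// used throughout this patch.
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/utilities/default_stream.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

#include <memory>

namespace cudf {
namespace detail {
// detail layer: the stream is always an explicit parameter
std::unique_ptr<column> do_thing(column_view const& input,
                                 rmm::cuda_stream_view stream,
                                 rmm::mr::device_memory_resource* mr);
}  // namespace detail

// public layer: forwards the process-wide default stream via the getter
std::unique_ptr<column> do_thing(column_view const& input, rmm::mr::device_memory_resource* mr)
{
  return detail::do_thing(input, cudf::get_default_stream(), mr);
}
}  // namespace cudf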
diff --git a/cpp/src/copying/contiguous_split.cu b/cpp/src/copying/contiguous_split.cu
index 0c90eb539fc..c52ca1f74df 100644
--- a/cpp/src/copying/contiguous_split.cu
+++ b/cpp/src/copying/contiguous_split.cu
@@ -1269,7 +1269,7 @@ std::vector<packed_table> contiguous_split(cudf::table_view const& input,
                                            rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::contiguous_split(input, splits, cudf::default_stream_value, mr);
+  return detail::contiguous_split(input, splits, cudf::get_default_stream(), mr);
 }

 };  // namespace cudf
diff --git a/cpp/src/copying/copy.cpp b/cpp/src/copying/copy.cpp
index d9a16315488..00147277231 100644
--- a/cpp/src/copying/copy.cpp
+++ b/cpp/src/copying/copy.cpp
@@ -183,7 +183,7 @@ std::unique_ptr<column> allocate_like(column_view const& input,
                                       rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::allocate_like(input, input.size(), mask_alloc, cudf::default_stream_value, mr);
+  return detail::allocate_like(input, input.size(), mask_alloc, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> allocate_like(column_view const& input,
@@ -192,7 +192,7 @@ std::unique_ptr<column> allocate_like(column_view const& input,
                                       rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::allocate_like(input, size, mask_alloc, cudf::default_stream_value, mr);
+  return detail::allocate_like(input, size, mask_alloc, cudf::get_default_stream(), mr);
 }

 }  // namespace cudf
diff --git a/cpp/src/copying/copy.cu b/cpp/src/copying/copy.cu
index 5585eac923c..0978cf441d8 100644
--- a/cpp/src/copying/copy.cu
+++ b/cpp/src/copying/copy.cu
@@ -180,7 +180,6 @@ std::unique_ptr<column> scatter_gather_based_if_else(cudf::column_view const& lhs,
     table_view{std::vector<column_view>{scatter_src_lhs->get_column(0).view()}},
     gather_map,
     table_view{std::vector<column_view>{rhs}},
-    false,
     stream,
     mr);

@@ -208,12 +207,8 @@ std::unique_ptr<column> scatter_gather_based_if_else(cudf::scalar const& lhs,
                                static_cast<size_type>(scatter_map_size),
                                scatter_map.begin()};

-  auto result = cudf::detail::scatter(scatter_source,
-                                      scatter_map_column_view,
-                                      table_view{std::vector<column_view>{rhs}},
-                                      false,
-                                      stream,
-                                      mr);
+  auto result = cudf::detail::scatter(
+    scatter_source, scatter_map_column_view, table_view{std::vector<column_view>{rhs}}, stream, mr);

   return std::move(result->release()[0]);
 }
@@ -415,7 +410,7 @@ std::unique_ptr<column> copy_if_else(column_view const& lhs,
                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::copy_if_else(lhs, rhs, boolean_mask, cudf::default_stream_value, mr);
+  return detail::copy_if_else(lhs, rhs, boolean_mask, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> copy_if_else(scalar const& lhs,
@@ -424,7 +419,7 @@ std::unique_ptr<column> copy_if_else(scalar const& lhs,
                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::copy_if_else(lhs, rhs, boolean_mask, cudf::default_stream_value, mr);
+  return detail::copy_if_else(lhs, rhs, boolean_mask, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> copy_if_else(column_view const& lhs,
@@ -433,7 +428,7 @@ std::unique_ptr<column> copy_if_else(column_view const& lhs,
                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::copy_if_else(lhs, rhs, boolean_mask, cudf::default_stream_value, mr);
+  return detail::copy_if_else(lhs, rhs, boolean_mask, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> copy_if_else(scalar const& lhs,
@@ -442,7 +437,7 @@ std::unique_ptr<column> copy_if_else(scalar const& lhs,
                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::copy_if_else(lhs, rhs, boolean_mask, cudf::default_stream_value, mr);
+  return detail::copy_if_else(lhs, rhs, boolean_mask, cudf::get_default_stream(), mr);
 }

 }  // namespace cudf
diff --git a/cpp/src/copying/copy_range.cu b/cpp/src/copying/copy_range.cu
index 080a8f645bd..dbcae354384 100644
--- a/cpp/src/copying/copy_range.cu
+++ b/cpp/src/copying/copy_range.cu
@@ -172,7 +172,8 @@ std::unique_ptr<column> out_of_place_copy_range_dispatch::operator()<cudf::dictionary32>(
...
   auto const target_view = cudf::dictionary_column_view(target_matched->view());
-  auto source_matched = cudf::dictionary::detail::set_keys(dict_source, target_view.keys(), stream);
+  auto source_matched = cudf::dictionary::detail::set_keys(
+    dict_source, target_view.keys(), stream, rmm::mr::get_current_device_resource());
   auto const source_view = cudf::dictionary_column_view(source_matched->view());

   // build the new indices by calling in_place_copy_range on just the indices
@@ -274,7 +275,7 @@ void copy_range_in_place(column_view const& source,
 {
   CUDF_FUNC_RANGE();
   return detail::copy_range_in_place(
-    source, target, source_begin, source_end, target_begin, cudf::default_stream_value);
+    source, target, source_begin, source_end, target_begin, cudf::get_default_stream());
 }

 std::unique_ptr<column> copy_range(column_view const& source,
@@ -286,7 +287,7 @@ std::unique_ptr<column> copy_range(column_view const& source,
 {
   CUDF_FUNC_RANGE();
   return detail::copy_range(
-    source, target, source_begin, source_end, target_begin, cudf::default_stream_value, mr);
+    source, target, source_begin, source_end, target_begin, cudf::get_default_stream(), mr);
 }

 }  // namespace cudf
diff --git a/cpp/src/copying/gather.cu b/cpp/src/copying/gather.cu
index d00d3a2a43e..93d05757722 100644
--- a/cpp/src/copying/gather.cu
+++ b/cpp/src/copying/gather.cu
@@ -85,7 +85,7 @@ std::unique_ptr<table> gather(table_view const& source_table,
                    : detail::negative_index_policy::ALLOWED;

   return detail::gather(
-    source_table, gather_map, bounds_policy, index_policy, cudf::default_stream_value, mr);
+    source_table, gather_map, bounds_policy, index_policy, cudf::get_default_stream(), mr);
 }

 }  // namespace cudf
diff --git a/cpp/src/copying/get_element.cu b/cpp/src/copying/get_element.cu
index f12b4639b25..5e76b4adbbe 100644
--- a/cpp/src/copying/get_element.cu
+++ b/cpp/src/copying/get_element.cu
@@ -210,7 +210,7 @@ std::unique_ptr<scalar> get_element(column_view const& input,
                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::get_element(input, index, cudf::default_stream_value, mr);
+  return detail::get_element(input, index, cudf::get_default_stream(), mr);
 }

 }  // namespace cudf
diff --git a/cpp/src/copying/pack.cpp b/cpp/src/copying/pack.cpp
index 5bc425ab7f5..427f2dfdade 100644
--- a/cpp/src/copying/pack.cpp
+++ b/cpp/src/copying/pack.cpp
@@ -219,7 +219,7 @@ table_view unpack(uint8_t const* metadata, uint8_t const* gpu_data)
 packed_columns pack(cudf::table_view const& input, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::pack(input, cudf::default_stream_value, mr);
+  return detail::pack(input, cudf::get_default_stream(), mr);
 }

 /**
diff --git a/cpp/src/copying/purge_nonempty_nulls.cu b/cpp/src/copying/purge_nonempty_nulls.cu
index 30de538ec7a..5bdf10c8af6 100644
--- a/cpp/src/copying/purge_nonempty_nulls.cu
+++ b/cpp/src/copying/purge_nonempty_nulls.cu
@@ -14,7 +14,7 @@
  * limitations under the License.
  */
 #include <...>
-#include <...>
+#include <...>

 #include <...>
 #include <...>
@@ -80,6 +80,24 @@ bool has_nonempty_nulls(cudf::column_view const& input, rmm::cuda_stream_view stream)
   return false;
 }

+std::unique_ptr<column> purge_nonempty_nulls(column_view const& input,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::mr::device_memory_resource* mr)
+{
+  // If not compound types (LIST/STRING/STRUCT/DICTIONARY) then just copy the input into output.
+  if (!cudf::is_compound(input.type())) { return std::make_unique<column>(input, stream, mr); }
+
+  // Implement via identity gather.
+  auto gathered_table = cudf::detail::gather(table_view{{input}},
+                                             thrust::make_counting_iterator(0),
+                                             thrust::make_counting_iterator(input.size()),
+                                             out_of_bounds_policy::DONT_CHECK,
+                                             stream,
+                                             mr);
+  return std::move(gathered_table->release().front());
+}
+
 }  // namespace detail

 /**
@@ -104,33 +122,18 @@ bool may_have_nonempty_nulls(column_view const& input)
 /**
  * @copydoc cudf::has_nonempty_nulls
  */
-bool has_nonempty_nulls(column_view const& input) { return detail::has_nonempty_nulls(input); }
-
-/**
- * @copydoc cudf::purge_nonempty_nulls(lists_column_view const&, rmm::mr::device_memory_resource*)
- */
-std::unique_ptr<column> purge_nonempty_nulls(lists_column_view const& input,
-                                             rmm::mr::device_memory_resource* mr)
-{
-  return detail::purge_nonempty_nulls(input, cudf::default_stream_value, mr);
-}
-
-/**
- * @copydoc cudf::purge_nonempty_nulls(structs_column_view const&, rmm::mr::device_memory_resource*)
- */
-std::unique_ptr<column> purge_nonempty_nulls(structs_column_view const& input,
-                                             rmm::mr::device_memory_resource* mr)
+bool has_nonempty_nulls(column_view const& input)
 {
-  return detail::purge_nonempty_nulls(input, cudf::default_stream_value, mr);
+  return detail::has_nonempty_nulls(input, cudf::get_default_stream());
 }

 /**
- * @copydoc cudf::purge_nonempty_nulls(strings_column_view const&, rmm::mr::device_memory_resource*)
+ * @copydoc cudf::purge_nonempty_nulls(column_view const&, rmm::mr::device_memory_resource*)
  */
-std::unique_ptr<column> purge_nonempty_nulls(strings_column_view const& input,
+std::unique_ptr<column> purge_nonempty_nulls(column_view const& input,
                                              rmm::mr::device_memory_resource* mr)
 {
-  return detail::purge_nonempty_nulls(input, cudf::default_stream_value, mr);
+  return detail::purge_nonempty_nulls(input, cudf::get_default_stream(), mr);
 }

 }  // namespace cudf
diff --git a/cpp/src/copying/reverse.cu b/cpp/src/copying/reverse.cu
index a1ffa115ad1..cf8ca7d9a92 100644
--- a/cpp/src/copying/reverse.cu
+++ b/cpp/src/copying/reverse.cu
@@ -57,13 +57,13 @@ std::unique_ptr<column> reverse(column_view const& source_column,
 std::unique_ptr<table> reverse(table_view const& source_table,
                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::reverse(source_table, cudf::default_stream_value, mr);
+  return detail::reverse(source_table, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> reverse(column_view const& source_column,
                                 rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::reverse(source_column, cudf::default_stream_value, mr);
+  return detail::reverse(source_column, cudf::get_default_stream(), mr);
 }

 }  // namespace cudf
diff --git a/cpp/src/copying/sample.cu b/cpp/src/copying/sample.cu
index 9a164bd053a..27a3f145caa 100644
--- a/cpp/src/copying/sample.cu
+++ b/cpp/src/copying/sample.cu
@@ -93,6 +93,6 @@ std::unique_ptr<table> sample(table_view const& input,
                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sample(input, n, replacement, seed, cudf::default_stream_value, mr);
+  return detail::sample(input, n, replacement, seed, cudf::get_default_stream(), mr);
 }

 }  // namespace cudf
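The scatter changes that follow delete the check_bounds parameter along with the device-side validation it guarded, so out-of-range map entries are no longer caught inside detail::scatter. A hedged sketch of how a caller that previously passed check_bounds=true could validate a map itself, mirroring the count_if test removed below (validate_scatter_map is a hypothetical helper name, not part of this patch):

// Sketch under the above assumptions; mirrors the deleted bounds check.
#include <cudf/column/column_view.hpp>
#include <cudf/detail/indexalator.cuh>
#include <cudf/utilities/error.hpp>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/count.h>

void validate_scatter_map(cudf::column_view const& scatter_map,
                          cudf::size_type n_rows,
                          rmm::cuda_stream_view stream)
{
  auto map_begin = cudf::detail::indexalator_factory::make_input_iterator(scatter_map);
  auto const in_range =
    thrust::count_if(rmm::exec_policy(stream),
                     map_begin,
                     map_begin + scatter_map.size(),
                     [n_rows] __device__(cudf::size_type index) {
                       // negative indices count back from the end, as in the removed check
                       return (index >= -n_rows) && (index < n_rows);
                     });
  CUDF_EXPECTS(in_range == scatter_map.size(), "Scatter map index out of bounds");
}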
diff --git a/cpp/src/copying/scatter.cu b/cpp/src/copying/scatter.cu
index 79c27816009..6083a698560 100644
--- a/cpp/src/copying/scatter.cu
+++ b/cpp/src/copying/scatter.cu
@@ -119,7 +119,7 @@ struct column_scalar_scatterer_impl {
     auto scalar_iter = thrust::make_permutation_iterator(scalar_impl->data(),
                                                          thrust::make_constant_iterator(0));

-    thrust::scatter(rmm::exec_policy(stream),
+    thrust::scatter(rmm::exec_policy_nosync(stream),
                     scalar_iter,
                     scalar_iter + scatter_rows,
                     scatter_iter,
@@ -184,14 +184,18 @@ struct column_scalar_scatterer_impl {
                                                          stream,
                                                          mr);
     auto dict_view    = dictionary_column_view(dict_target->view());
-    auto scalar_index = dictionary::detail::get_index(dict_view, source.get(), stream);
-    auto scalar_iter  = thrust::make_permutation_iterator(
+    auto scalar_index = dictionary::detail::get_index(
+      dict_view, source.get(), stream, rmm::mr::get_current_device_resource());
+    auto scalar_iter = thrust::make_permutation_iterator(
       indexalator_factory::make_input_iterator(*scalar_index), thrust::make_constant_iterator(0));
     auto new_indices = std::make_unique<column>(dict_view.get_indices_annotated(), stream, mr);
     auto target_iter = indexalator_factory::make_output_iterator(new_indices->mutable_view());

-    thrust::scatter(
-      rmm::exec_policy(stream), scalar_iter, scalar_iter + scatter_rows, scatter_iter, target_iter);
+    thrust::scatter(rmm::exec_policy_nosync(stream),
+                    scalar_iter,
+                    scalar_iter + scatter_rows,
+                    scatter_iter,
+                    target_iter);

     // build the dictionary indices column from the result
     auto const indices_type = new_indices->type();
@@ -285,7 +289,6 @@ std::unique_ptr<table> scatter(table_view const& source,
                                column_view const& scatter_map,
                                table_view const& target,
-                               bool check_bounds,
                                rmm::cuda_stream_view stream,
                                rmm::mr::device_memory_resource* mr)
 {
@@ -307,13 +310,12 @@ std::unique_ptr<table> scatter(table_view const& source,
   // create index type normalizing iterator for the scatter_map
   auto map_begin = indexalator_factory::make_input_iterator(scatter_map);
   auto map_end   = map_begin + scatter_map.size();
-  return detail::scatter(source, map_begin, map_end, target, check_bounds, stream, mr);
+  return detail::scatter(source, map_begin, map_end, target, stream, mr);
 }

 std::unique_ptr<table> scatter(table_view const& source,
                                device_span<size_type const> const scatter_map,
                                table_view const& target,
-                               bool check_bounds,
                                rmm::cuda_stream_view stream,
                                rmm::mr::device_memory_resource* mr)
 {
@@ -322,13 +324,12 @@ std::unique_ptr<table> scatter(table_view const& source,
   auto map_col = column_view(data_type{type_to_id<size_type>()},
                              static_cast<size_type>(scatter_map.size()),
                              scatter_map.data());
-  return scatter(source, map_col, target, check_bounds, stream, mr);
+  return scatter(source, map_col, target, stream, mr);
 }

 std::unique_ptr<table> scatter(std::vector<std::reference_wrapper<const scalar>> const& source,
                                column_view const& indices,
                                table_view const& target,
-                               bool check_bounds,
                                rmm::cuda_stream_view stream,
                                rmm::mr::device_memory_resource* mr)
 {
@@ -340,20 +341,9 @@ std::unique_ptr<table> scatter(std::vector<std::reference_wrapper<const scalar>> const& source,
   // Create normalizing iterator for indices column
   auto map_begin = indexalator_factory::make_input_iterator(indices);
-  auto map_end   = map_begin + indices.size();

   // Optionally check map index values are within the number of target rows.
   auto const n_rows = target.num_rows();
-  if (check_bounds) {
-    CUDF_EXPECTS(
-      indices.size() == thrust::count_if(rmm::exec_policy(stream),
-                                         map_begin,
-                                         map_end,
-                                         [n_rows] __device__(size_type index) {
-                                           return ((index >= -n_rows) && (index < n_rows));
-                                         }),
-      "Scatter map index out of bounds");
-  }

   // Transform negative indices to index + target size
   auto scatter_rows = indices.size();
@@ -396,7 +386,7 @@ std::unique_ptr<column> boolean_mask_scatter(column_view const& input,
     data_type{type_id::INT32}, target.size(), mask_state::UNALLOCATED, stream);
   auto mutable_indices = indices->mutable_view();

-  thrust::sequence(rmm::exec_policy(stream),
+  thrust::sequence(rmm::exec_policy_nosync(stream),
                    mutable_indices.begin<size_type>(),
                    mutable_indices.end<size_type>(),
                    0);
@@ -404,12 +394,8 @@ std::unique_ptr<column> boolean_mask_scatter(column_view const& input,
   // The scatter map is actually a table with only one column, which is scatter map.
   auto scatter_map =
     detail::apply_boolean_mask(table_view{{indices->view()}}, boolean_mask, stream);
-  auto output_table = detail::scatter(table_view{{input}},
-                                      scatter_map->get_column(0).view(),
-                                      table_view{{target}},
-                                      false,
-                                      stream,
-                                      mr);
+  auto output_table = detail::scatter(
+    table_view{{input}}, scatter_map->get_column(0).view(), table_view{{target}}, stream, mr);

   // There is only one column in output_table
   return std::make_unique<column>(std::move(output_table->get_column(0)));
@@ -505,21 +491,19 @@ std::unique_ptr<table> boolean_mask_scatter(
 std::unique_ptr<table> scatter(table_view const& source,
                                column_view const& scatter_map,
                                table_view const& target,
-                               bool check_bounds,
                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::scatter(source, scatter_map, target, check_bounds, cudf::default_stream_value, mr);
+  return detail::scatter(source, scatter_map, target, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<table> scatter(std::vector<std::reference_wrapper<const scalar>> const& source,
                                column_view const& indices,
                                table_view const& target,
-                               bool check_bounds,
                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::scatter(source, indices, target, check_bounds, cudf::default_stream_value, mr);
+  return detail::scatter(source, indices, target, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<table> boolean_mask_scatter(table_view const& input,
@@ -528,7 +512,7 @@ std::unique_ptr<table> boolean_mask_scatter(table_view const& input,
                                             rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::boolean_mask_scatter(input, target, boolean_mask, cudf::default_stream_value, mr);
+  return detail::boolean_mask_scatter(input, target, boolean_mask, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<table> boolean_mask_scatter(
@@ -538,7 +522,7 @@ std::unique_ptr<table> boolean_mask_scatter(
   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::boolean_mask_scatter(input, target, boolean_mask, cudf::default_stream_value, mr);
+  return detail::boolean_mask_scatter(input, target, boolean_mask, cudf::get_default_stream(), mr);
 }

 }  // namespace cudf
diff --git a/cpp/src/copying/shift.cu b/cpp/src/copying/shift.cu
index 607388cff56..a6126374ed2 100644
--- a/cpp/src/copying/shift.cu
+++ b/cpp/src/copying/shift.cu
@@ -174,7 +174,7 @@ std::unique_ptr<column> shift(column_view const& input,
                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::shift(input, offset, fill_value, cudf::default_stream_value, mr);
+  return detail::shift(input, offset, fill_value, cudf::get_default_stream(), mr);
 }

 }  // namespace cudf
diff --git a/cpp/src/copying/slice.cu b/cpp/src/copying/slice.cu
index e329756b0df..52410ada128 100644
--- a/cpp/src/copying/slice.cu
+++ b/cpp/src/copying/slice.cu
@@ -114,25 +114,25 @@ std::vector<table_view> slice(table_view const& input,
 std::vector<column_view> slice(column_view const& input, host_span<size_type const> indices)
 {
   CUDF_FUNC_RANGE();
-  return detail::slice(input, indices, cudf::default_stream_value);
+  return detail::slice(input, indices, cudf::get_default_stream());
 }

 std::vector<table_view> slice(table_view const& input, host_span<size_type const> indices)
 {
   CUDF_FUNC_RANGE();
-  return detail::slice(input, indices, cudf::default_stream_value);
+  return detail::slice(input, indices, cudf::get_default_stream());
 };

 std::vector<column_view> slice(column_view const& input, std::initializer_list<size_type> indices)
 {
   CUDF_FUNC_RANGE();
-  return detail::slice(input, indices, cudf::default_stream_value);
+  return detail::slice(input, indices, cudf::get_default_stream());
 }

 std::vector<table_view> slice(table_view const& input, std::initializer_list<size_type> indices)
 {
   CUDF_FUNC_RANGE();
-  return detail::slice(input, indices, cudf::default_stream_value);
+  return detail::slice(input, indices, cudf::get_default_stream());
 };

 }  // namespace cudf
diff --git a/cpp/src/copying/split.cpp b/cpp/src/copying/split.cpp
index 19ecd959172..b577886febf 100644
--- a/cpp/src/copying/split.cpp
+++ b/cpp/src/copying/split.cpp
@@ -86,26 +86,26 @@ std::vector<table_view> split(cudf::table_view const& input,
 std::vector<column_view> split(cudf::column_view const& input, host_span<size_type const> splits)
 {
   CUDF_FUNC_RANGE();
-  return detail::split(input, splits, cudf::default_stream_value);
+  return detail::split(input, splits, cudf::get_default_stream());
 }

 std::vector<table_view> split(cudf::table_view const& input, host_span<size_type const> splits)
 {
   CUDF_FUNC_RANGE();
-  return detail::split(input, splits, cudf::default_stream_value);
+  return detail::split(input, splits, cudf::get_default_stream());
 }

 std::vector<column_view> split(column_view const& input, std::initializer_list<size_type> splits)
 {
   CUDF_FUNC_RANGE();
-  return detail::split(input, splits, cudf::default_stream_value);
+  return detail::split(input, splits, cudf::get_default_stream());
 }

 std::vector<table_view> split(table_view const& input, std::initializer_list<size_type> splits)
 {
   CUDF_FUNC_RANGE();
-  return detail::split(input, splits, cudf::default_stream_value);
+  return detail::split(input, splits, cudf::get_default_stream());
 }

 }  // namespace cudf
diff --git a/cpp/src/datetime/datetime_ops.cu b/cpp/src/datetime/datetime_ops.cu
index ee026d6c395..db1d04259b5 100644
--- a/cpp/src/datetime/datetime_ops.cu
+++ b/cpp/src/datetime/datetime_ops.cu
@@ -76,9 +76,22 @@ struct extract_component_operator {
     if (time_since_midnight.count() < 0) { time_since_midnight += days(1); }

-    auto hrs_  = duration_cast<hours>(time_since_midnight);
-    auto mins_ = duration_cast<minutes>(time_since_midnight - hrs_);
-    auto secs_ = duration_cast<seconds>(time_since_midnight - hrs_ - mins_);
+    auto const hrs_  = [&] { return duration_cast<hours>(time_since_midnight); };
+    auto const mins_ = [&] { return duration_cast<minutes>(time_since_midnight) - hrs_(); };
+    auto const secs_ = [&] {
+      return duration_cast<seconds>(time_since_midnight) - hrs_() - mins_();
+    };
+    auto const millisecs_ = [&] {
+      return duration_cast<milliseconds>(time_since_midnight) - hrs_() - mins_() - secs_();
+    };
+    auto const microsecs_ = [&] {
+      return duration_cast<microseconds>(time_since_midnight) - hrs_() - mins_() - secs_() -
+             millisecs_();
+    };
+    auto const nanosecs_ = [&] {
+      return duration_cast<nanoseconds>(time_since_midnight) - hrs_() - mins_() - secs_() -
+             millisecs_() - microsecs_();
+    };

     switch (Component) {
       case datetime_component::YEAR:
@@ -89,9 +102,12 @@ struct extract_component_operator {
         return static_cast<uint8_t>(year_month_day(days_since_epoch).day());
       case datetime_component::WEEKDAY:
         return year_month_weekday(days_since_epoch).weekday().iso_encoding();
-      case datetime_component::HOUR: return hrs_.count();
-      case datetime_component::MINUTE: return mins_.count();
-      case datetime_component::SECOND: return secs_.count();
+      case datetime_component::HOUR: return hrs_().count();
+      case datetime_component::MINUTE: return mins_().count();
+      case datetime_component::SECOND: return secs_().count();
+      case datetime_component::MILLISECOND: return millisecs_().count();
+      case datetime_component::MICROSECOND: return microsecs_().count();
+      case datetime_component::NANOSECOND: return nanosecs_().count();
       default: return 0;
     }
   }
@@ -495,6 +511,33 @@ std::unique_ptr<column> extract_second(column_view const& column,
     detail::extract_component_operator<datetime_component::SECOND>,
     cudf::type_id::INT16>(column, stream, mr);
 }

+std::unique_ptr<column> extract_millisecond_fraction(column_view const& column,
+                                                     rmm::cuda_stream_view stream,
+                                                     rmm::mr::device_memory_resource* mr)
+{
+  return detail::apply_datetime_op<
+    detail::extract_component_operator<datetime_component::MILLISECOND>,
+    cudf::type_id::INT16>(column, stream, mr);
+}
+
+std::unique_ptr<column> extract_microsecond_fraction(column_view const& column,
+                                                     rmm::cuda_stream_view stream,
+                                                     rmm::mr::device_memory_resource* mr)
+{
+  return detail::apply_datetime_op<
+    detail::extract_component_operator<datetime_component::MICROSECOND>,
+    cudf::type_id::INT16>(column, stream, mr);
+}
+
+std::unique_ptr<column> extract_nanosecond_fraction(column_view const& column,
+                                                    rmm::cuda_stream_view stream,
+                                                    rmm::mr::device_memory_resource* mr)
+{
+  return detail::apply_datetime_op<
+    detail::extract_component_operator<datetime_component::NANOSECOND>,
+    cudf::type_id::INT16>(column, stream, mr);
+}
+
 std::unique_ptr<column> last_day_of_month(column_view const& column,
                                           rmm::cuda_stream_view stream,
                                           rmm::mr::device_memory_resource* mr)
@@ -540,7 +583,7 @@ std::unique_ptr<column> ceil_datetimes(column_view const& column,
 {
   CUDF_FUNC_RANGE();
   return detail::round_general(
-    detail::rounding_function::CEIL, freq, column, cudf::default_stream_value, mr);
+    detail::rounding_function::CEIL, freq, column, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> floor_datetimes(column_view const& column,
@@ -549,7 +592,7 @@ std::unique_ptr<column> floor_datetimes(column_view const& column,
 {
   CUDF_FUNC_RANGE();
   return detail::round_general(
-    detail::rounding_function::FLOOR, freq, column, cudf::default_stream_value, mr);
+    detail::rounding_function::FLOOR, freq, column, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> round_datetimes(column_view const& column,
@@ -558,66 +601,87 @@ std::unique_ptr<column> round_datetimes(column_view const& column,
 {
   CUDF_FUNC_RANGE();
   return detail::round_general(
-    detail::rounding_function::ROUND, freq, column, cudf::default_stream_value, mr);
+    detail::rounding_function::ROUND, freq, column, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> extract_year(column_view const& column, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_year(column, cudf::default_stream_value, mr);
+  return detail::extract_year(column, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> extract_month(column_view const& column, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_month(column, cudf::default_stream_value, mr);
+  return detail::extract_month(column, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> extract_day(column_view const& column, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_day(column, cudf::default_stream_value, mr);
+  return detail::extract_day(column, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> extract_weekday(column_view const& column, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_weekday(column, cudf::default_stream_value, mr);
+  return detail::extract_weekday(column, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> extract_hour(column_view const& column, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_hour(column, cudf::default_stream_value, mr);
+  return detail::extract_hour(column, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> extract_minute(column_view const& column, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_minute(column, cudf::default_stream_value, mr);
+  return detail::extract_minute(column, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> extract_second(column_view const& column, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_second(column, cudf::default_stream_value, mr);
+  return detail::extract_second(column, cudf::get_default_stream(), mr);
+}
+
+std::unique_ptr<column> extract_millisecond_fraction(column_view const& column,
+                                                     rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::extract_millisecond_fraction(column, cudf::get_default_stream(), mr);
+}
+
+std::unique_ptr<column> extract_microsecond_fraction(column_view const& column,
+                                                     rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::extract_microsecond_fraction(column, cudf::get_default_stream(), mr);
+}
+
+std::unique_ptr<column> extract_nanosecond_fraction(column_view const& column,
+                                                    rmm::mr::device_memory_resource* mr)
+{
+  CUDF_FUNC_RANGE();
+  return detail::extract_nanosecond_fraction(column, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> last_day_of_month(column_view const& column,
                                           rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::last_day_of_month(column, cudf::default_stream_value, mr);
+  return detail::last_day_of_month(column, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> day_of_year(column_view const& column, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::day_of_year(column, cudf::default_stream_value, mr);
+  return detail::day_of_year(column, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> add_calendrical_months(cudf::column_view const& timestamp_column,
@@ -626,7 +690,7 @@ std::unique_ptr<column> add_calendrical_months(cudf::column_view const& timestamp_column,
 {
   CUDF_FUNC_RANGE();
   return detail::add_calendrical_months(
-    timestamp_column, months_column, cudf::default_stream_value, mr);
+    timestamp_column, months_column, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> add_calendrical_months(cudf::column_view const& timestamp_column,
@@ -634,27 +698,27 @@ std::unique_ptr<column> add_calendrical_months(cudf::column_view const& timestamp_column,
                                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::add_calendrical_months(timestamp_column, months, cudf::default_stream_value, mr);
+  return detail::add_calendrical_months(timestamp_column, months, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> is_leap_year(column_view const& column, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::is_leap_year(column, cudf::default_stream_value, mr);
+  return detail::is_leap_year(column, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> days_in_month(column_view const& column, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::days_in_month(column, cudf::default_stream_value, mr);
+  return detail::days_in_month(column, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> extract_quarter(column_view const& column, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::extract_quarter(column, cudf::default_stream_value, mr);
+  return detail::extract_quarter(column, cudf::get_default_stream(), mr);
 }

 }  // namespace datetime
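The new extractors return the fractional component within its own unit as INT16, computed by the cascading duration_cast lambdas above. A hedged usage sketch (subsecond_parts is an illustrative name, not from this patch): for a TIMESTAMP_NANOSECONDS value such as 01:02:03.123456789, the three calls yield 123, 456 and 789 respectively.

// Sketch only; uses the public APIs added in this diff.
#include <cudf/column/column.hpp>
#include <cudf/column/column_view.hpp>
#include <cudf/datetime.hpp>

#include <memory>
#include <vector>

std::vector<std::unique_ptr<cudf::column>> subsecond_parts(cudf::column_view const& ts)
{
  std::vector<std::unique_ptr<cudf::column>> parts;
  parts.push_back(cudf::datetime::extract_millisecond_fraction(ts));  // e.g. 123
  parts.push_back(cudf::datetime::extract_microsecond_fraction(ts));  // e.g. 456
  parts.push_back(cudf::datetime::extract_nanosecond_fraction(ts));   // e.g. 789
  return parts;
}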
diff --git a/cpp/src/dictionary/add_keys.cu b/cpp/src/dictionary/add_keys.cu
index 3dea491b6e4..486e7d2d24b 100644
--- a/cpp/src/dictionary/add_keys.cu
+++ b/cpp/src/dictionary/add_keys.cu
@@ -44,11 +44,10 @@ namespace detail {
  * d2 is now {[a, b, c, d, e, f], [5, 0, 3, 1, 2, 2, 2, 5, 0]}
  * ```
  */
-std::unique_ptr<column> add_keys(
-  dictionary_column_view const& dictionary_column,
-  column_view const& new_keys,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column,
+                                 column_view const& new_keys,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(!new_keys.has_nulls(), "Keys must not have nulls");
   auto old_keys = dictionary_column.keys();  // [a,b,c,d,f]
@@ -132,7 +131,7 @@ std::unique_ptr<column> add_keys(dictionary_column_view const& dictionary_column,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::add_keys(dictionary_column, keys, cudf::default_stream_value, mr);
+  return detail::add_keys(dictionary_column, keys, cudf::get_default_stream(), mr);
 }

 }  // namespace dictionary
diff --git a/cpp/src/dictionary/decode.cu b/cpp/src/dictionary/decode.cu
index 22e2ee578a0..01411d06b62 100644
--- a/cpp/src/dictionary/decode.cu
+++ b/cpp/src/dictionary/decode.cu
@@ -68,7 +68,7 @@ std::unique_ptr<column> decode(dictionary_column_view const& source,
                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::decode(source, cudf::default_stream_value, mr);
+  return detail::decode(source, cudf::get_default_stream(), mr);
 }

 }  // namespace dictionary
diff --git a/cpp/src/dictionary/encode.cu b/cpp/src/dictionary/encode.cu
index 4e8f992b633..fe8e777b694 100644
--- a/cpp/src/dictionary/encode.cu
+++ b/cpp/src/dictionary/encode.cu
@@ -92,7 +92,7 @@ std::unique_ptr<column> encode(column_view const& input_column,
                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::encode(input_column, indices_type, cudf::default_stream_value, mr);
+  return detail::encode(input_column, indices_type, cudf::get_default_stream(), mr);
 }

 }  // namespace dictionary
diff --git a/cpp/src/dictionary/remove_keys.cu b/cpp/src/dictionary/remove_keys.cu
index 4506ea98ca4..dcb877da686 100644
--- a/cpp/src/dictionary/remove_keys.cu
+++ b/cpp/src/dictionary/remove_keys.cu
@@ -56,11 +56,10 @@ namespace {
  * @param mr Device memory resource used to allocate the returned column's device memory.
  */
 template <typename KeysKeeper>
-std::unique_ptr<column> remove_keys_fn(
-  dictionary_column_view const& dictionary_column,
-  KeysKeeper keys_to_keep_fn,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+std::unique_ptr<column> remove_keys_fn(dictionary_column_view const& dictionary_column,
+                                       KeysKeeper keys_to_keep_fn,
+                                       rmm::cuda_stream_view stream,
+                                       rmm::mr::device_memory_resource* mr)
 {
   auto const keys_view    = dictionary_column.keys();
   auto const indices_type = dictionary_column.indices().type();
@@ -148,11 +147,10 @@ std::unique_ptr<column> remove_keys_fn(

 }  // namespace

-std::unique_ptr<column> remove_keys(
-  dictionary_column_view const& dictionary_column,
-  column_view const& keys_to_remove,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+std::unique_ptr<column> remove_keys(dictionary_column_view const& dictionary_column,
+                                    column_view const& keys_to_remove,
+                                    rmm::cuda_stream_view stream,
+                                    rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(!keys_to_remove.has_nulls(), "keys_to_remove must not have nulls");
   auto const keys_view = dictionary_column.keys();
@@ -166,10 +164,9 @@ std::unique_ptr<column> remove_keys(
   return remove_keys_fn(dictionary_column, key_matcher, stream, mr);
 }

-std::unique_ptr<column> remove_unused_keys(
-  dictionary_column_view const& dictionary_column,
-  rmm::cuda_stream_view stream = cudf::default_stream_value,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+std::unique_ptr<column> remove_unused_keys(dictionary_column_view const& dictionary_column,
+                                           rmm::cuda_stream_view stream,
+                                           rmm::mr::device_memory_resource* mr)
 {
   // locate the keys to remove
   auto const keys_size = dictionary_column.keys_size();
@@ -200,14 +197,14 @@ std::unique_ptr<column> remove_keys(dictionary_column_view const& dictionary_column,
                                     rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::remove_keys(dictionary_column, keys_to_remove, cudf::default_stream_value, mr);
+  return detail::remove_keys(dictionary_column, keys_to_remove, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> remove_unused_keys(dictionary_column_view const& dictionary_column,
                                            rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::remove_unused_keys(dictionary_column, cudf::default_stream_value, mr);
+  return detail::remove_unused_keys(dictionary_column, cudf::get_default_stream(), mr);
 }

 }  // namespace dictionary
diff --git a/cpp/src/dictionary/replace.cu b/cpp/src/dictionary/replace.cu
index 4acc2d124b2..7069993866c 100644
--- a/cpp/src/dictionary/replace.cu
+++ b/cpp/src/dictionary/replace.cu
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2020-2021, NVIDIA CORPORATION.
+ * Copyright (c) 2020-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -123,8 +123,9 @@ std::unique_ptr<column> replace_nulls(dictionary_column_view const& input,
   // first add the replacement to the keys so only the indices need to be processed
   auto input_matched = dictionary::detail::add_keys(
     input, make_column_from_scalar(replacement, 1, stream)->view(), stream, mr);
-  auto const input_view   = dictionary_column_view(input_matched->view());
-  auto const scalar_index = get_index(input_view, replacement, stream);
+  auto const input_view = dictionary_column_view(input_matched->view());
+  auto const scalar_index =
+    get_index(input_view, replacement, stream, rmm::mr::get_current_device_resource());

   // now build the new indices by doing replace-null on the updated indices
   auto const input_indices = input_view.get_indices_annotated();
diff --git a/cpp/src/dictionary/search.cu b/cpp/src/dictionary/search.cu
index 3936f7470e5..8e97a387780 100644
--- a/cpp/src/dictionary/search.cu
+++ b/cpp/src/dictionary/search.cu
@@ -79,7 +79,7 @@ struct find_index_fn {
     using ScalarType = cudf::scalar_type_t<Element>;
     auto find_key    = static_cast<ScalarType const&>(key).value(stream);
     auto keys_view   = column_device_view::create(input.keys(), stream);
-    auto iter        = thrust::equal_range(rmm::exec_policy(cudf::default_stream_value),
+    auto iter        = thrust::equal_range(rmm::exec_policy(cudf::get_default_stream()),
                                            keys_view->begin<Element>(),
                                            keys_view->end<Element>(),
                                            find_key);
@@ -179,7 +179,7 @@ std::unique_ptr<scalar> get_index(dictionary_column_view const& dictionary,
                                   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::get_index(dictionary, key, cudf::default_stream_value, mr);
+  return detail::get_index(dictionary, key, cudf::get_default_stream(), mr);
 }

 }  // namespace dictionary
diff --git a/cpp/src/dictionary/set_keys.cu b/cpp/src/dictionary/set_keys.cu
index 216f00c90e1..075fb6115e3 100644
--- a/cpp/src/dictionary/set_keys.cu
+++ b/cpp/src/dictionary/set_keys.cu
@@ -116,11 +116,10 @@ struct dispatch_compute_indices {

 }  // namespace

 //
-std::unique_ptr<column> set_keys(
-  dictionary_column_view const& dictionary_column,
-  column_view const& new_keys,
-  rmm::cuda_stream_view stream,
-  rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column,
+                                 column_view const& new_keys,
+                                 rmm::cuda_stream_view stream,
+                                 rmm::mr::device_memory_resource* mr)
 {
   CUDF_EXPECTS(!new_keys.has_nulls(), "keys parameter must not have nulls");
   auto keys = dictionary_column.keys();
@@ -245,14 +244,14 @@ std::unique_ptr<column> set_keys(dictionary_column_view const& dictionary_column,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::set_keys(dictionary_column, keys, cudf::default_stream_value, mr);
+  return detail::set_keys(dictionary_column, keys, cudf::get_default_stream(), mr);
 }

 std::vector<std::unique_ptr<column>> match_dictionaries(
   cudf::host_span<dictionary_column_view const> input, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::match_dictionaries(input, cudf::default_stream_value, mr);
+  return detail::match_dictionaries(input, cudf::get_default_stream(), mr);
 }

 }  // namespace dictionary
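A second convention change runs through the dictionary hunks above: detail functions such as get_index and set_keys lose their defaulted memory-resource argument, so every internal call site now states where its allocation comes from. A short sketch of the resulting calling convention (dict_view, key and stream are assumed to be in scope): when the returned object is a temporary that never escapes the function, its storage is taken from the current device resource rather than the caller-supplied mr.

// Fragment under the above assumptions, matching the call sites in this diff.
auto const scalar_index = cudf::dictionary::detail::get_index(
  dict_view, key, stream, rmm::mr::get_current_device_resource());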
diff --git a/cpp/src/filling/calendrical_month_sequence.cu b/cpp/src/filling/calendrical_month_sequence.cu
index d4b3e209c4a..f45634a615e 100644
--- a/cpp/src/filling/calendrical_month_sequence.cu
+++ b/cpp/src/filling/calendrical_month_sequence.cu
@@ -43,7 +43,7 @@ std::unique_ptr<column> calendrical_month_sequence(size_type size,
                                                    rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::calendrical_month_sequence(size, init, months, cudf::default_stream_value, mr);
+  return detail::calendrical_month_sequence(size, init, months, cudf::get_default_stream(), mr);
 }

 }  // namespace cudf
diff --git a/cpp/src/filling/fill.cu b/cpp/src/filling/fill.cu
index 2abb0cf9795..dac36032583 100644
--- a/cpp/src/filling/fill.cu
+++ b/cpp/src/filling/fill.cu
@@ -171,7 +171,8 @@ std::unique_ptr<column> out_of_place_fill_range_dispatch::operator()<cudf::dictionary32>(
...
     cudf::dictionary_column_view(target_matched->view()).get_indices_annotated();

   // get the index of the key just added
-  auto index_of_value = cudf::dictionary::detail::get_index(target_matched->view(), value, stream);
+  auto index_of_value = cudf::dictionary::detail::get_index(
+    target_matched->view(), value, stream, rmm::mr::get_current_device_resource());

   // now call fill using just the indices column and the new index
   auto new_indices = cudf::type_dispatcher(target_indices.type(),
@@ -248,7 +249,7 @@ void fill_in_place(mutable_column_view& destination,
                    scalar const& value)
 {
   CUDF_FUNC_RANGE();
-  return detail::fill_in_place(destination, begin, end, value, cudf::default_stream_value);
+  return detail::fill_in_place(destination, begin, end, value, cudf::get_default_stream());
 }

 std::unique_ptr<column> fill(column_view const& input,
@@ -258,7 +259,7 @@ std::unique_ptr<column> fill(column_view const& input,
                              rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::fill(input, begin, end, value, cudf::default_stream_value, mr);
+  return detail::fill(input, begin, end, value, cudf::get_default_stream(), mr);
 }

 }  // namespace cudf
diff --git a/cpp/src/filling/repeat.cu b/cpp/src/filling/repeat.cu
index 90c644933ec..8d86a9d9827 100644
--- a/cpp/src/filling/repeat.cu
+++ b/cpp/src/filling/repeat.cu
@@ -103,7 +103,6 @@ namespace cudf {
 namespace detail {
 std::unique_ptr<table> repeat(table_view const& input_table,
                               column_view const& count,
-                              bool check_count,
                               rmm::cuda_stream_view stream,
                               rmm::mr::device_memory_resource* mr)
 {
@@ -112,19 +111,12 @@ std::unique_ptr<table> repeat(table_view const& input_table,
   if (input_table.num_rows() == 0) { return cudf::empty_like(input_table); }

-  if (check_count) { cudf::type_dispatcher(count.type(), count_checker{count}, stream); }
-
   auto count_iter = cudf::detail::indexalator_factory::make_input_iterator(count);

   rmm::device_uvector<size_type> offsets(count.size(), stream);
   thrust::inclusive_scan(
     rmm::exec_policy(stream), count_iter, count_iter + count.size(), offsets.begin());

-  if (check_count) {
-    CUDF_EXPECTS(thrust::is_sorted(rmm::exec_policy(stream), offsets.begin(), offsets.end()),
-                 "count has negative values or the resulting table has too many rows.");
-  }
-
   size_type output_size{offsets.back_element(stream)};
   rmm::device_uvector<size_type> indices(output_size, stream);
   thrust::upper_bound(rmm::exec_policy(stream),
@@ -162,11 +154,10 @@ std::unique_ptr<table> repeat(table_view const& input_table,

 std::unique_ptr<table> repeat(table_view const& input_table,
                               column_view const& count,
-                              bool check_count,
                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::repeat(input_table, count, check_count, cudf::default_stream_value, mr);
+  return detail::repeat(input_table, count, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<table> repeat(table_view const& input_table,
@@ -174,7 +165,7 @@ std::unique_ptr<table> repeat(table_view const& input_table,
                               rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::repeat(input_table, count, cudf::default_stream_value, mr);
+  return detail::repeat(input_table, count, cudf::get_default_stream(), mr);
 }

 }  // namespace cudf
diff --git a/cpp/src/filling/sequence.cu b/cpp/src/filling/sequence.cu
index a2ae3b9e70c..284e7c46347 100644
--- a/cpp/src/filling/sequence.cu
+++ b/cpp/src/filling/sequence.cu
@@ -154,7 +154,7 @@ std::unique_ptr<column> sequence(size_type size,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sequence(size, init, step, cudf::default_stream_value, mr);
+  return detail::sequence(size, init, step, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> sequence(size_type size,
@@ -162,7 +162,7 @@ std::unique_ptr<column> sequence(size_type size,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sequence(size, init, cudf::default_stream_value, mr);
+  return detail::sequence(size, init, cudf::get_default_stream(), mr);
 }

 }  // namespace cudf
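As with scatter, repeat() above drops its check_count flag: counts are now trusted and the gather offsets come straight from the inclusive scan. A hedged caller-side guard (an assumption, not part of this patch; note the deleted check also caught offset overflow via thrust::is_sorted on the scanned offsets, which this sketch does not cover):

// Sketch only; verifies non-negative counts before calling the check-free repeat().
#include <cudf/column/column_view.hpp>
#include <cudf/detail/indexalator.cuh>

#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/logical.h>

bool counts_are_valid(cudf::column_view const& count, rmm::cuda_stream_view stream)
{
  auto it = cudf::detail::indexalator_factory::make_input_iterator(count);
  return thrust::all_of(rmm::exec_policy(stream),
                        it,
                        it + count.size(),
                        [] __device__(cudf::size_type c) { return c >= 0; });
}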
diff --git a/cpp/src/groupby/groupby.cu b/cpp/src/groupby/groupby.cu
index cd54e921a4c..dde0037a8c3 100644
--- a/cpp/src/groupby/groupby.cu
+++ b/cpp/src/groupby/groupby.cu
@@ -196,7 +196,7 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::aggregate(
   if (_keys.num_rows() == 0) { return std::pair(empty_like(_keys), empty_results(requests)); }

-  return dispatch_aggregation(requests, cudf::default_stream_value, mr);
+  return dispatch_aggregation(requests, cudf::get_default_stream(), mr);
 }

 // Compute scan requests
@@ -214,13 +214,13 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby::scan(
   if (_keys.num_rows() == 0) { return std::pair(empty_like(_keys), empty_results(requests)); }

-  return sort_scan(requests, cudf::default_stream_value, mr);
+  return sort_scan(requests, cudf::get_default_stream(), mr);
 }

 groupby::groups groupby::get_groups(table_view values, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  auto const stream = cudf::default_stream_value;
+  auto const stream = cudf::get_default_stream();
   auto grouped_keys = helper().sorted_keys(stream, mr);

   auto const& group_offsets = helper().group_offsets(stream);
@@ -252,7 +252,7 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::replace_nulls(
                "Size mismatch between num_columns and replace_policies.");

   if (values.is_empty()) { return std::pair(empty_like(_keys), empty_like(values)); }

-  auto const stream = cudf::default_stream_value;
+  auto const stream = cudf::get_default_stream();
   auto const& group_labels = helper().group_labels(stream);

   std::vector<std::unique_ptr<column>> results;
@@ -298,7 +298,7 @@ std::pair<std::unique_ptr<table>, std::unique_ptr<table>> groupby::shift(
     [&](auto i) { return values.column(i).type() == fill_values[i].get().type(); }),
     "values and fill_value should have the same type.");

-  auto stream = cudf::default_stream_value;
+  auto stream = cudf::get_default_stream();
   std::vector<std::unique_ptr<column>> results;
   auto const& group_offsets = helper().group_offsets(stream);
   std::transform(
diff --git a/cpp/src/groupby/hash/groupby.cu b/cpp/src/groupby/hash/groupby.cu
index c07833520ab..8410d499f1a 100644
--- a/cpp/src/groupby/hash/groupby.cu
+++ b/cpp/src/groupby/hash/groupby.cu
@@ -512,18 +512,33 @@ rmm::device_uvector<size_type> extract_populated_keys(map_type const& map,
 {
   rmm::device_uvector<size_type> populated_keys(num_keys, stream);

-  auto get_key    = [] __device__(auto const& element) { return element.first; };  // first = key
-  auto get_key_it = thrust::make_transform_iterator(map.data(), get_key);
-  auto key_used   = [unused = map.get_unused_key()] __device__(auto key) { return key != unused; };
-
-  auto end_it = thrust::copy_if(rmm::exec_policy(stream),
-                                get_key_it,
-                                get_key_it + map.capacity(),
-                                populated_keys.begin(),
-                                key_used);
-
-  populated_keys.resize(std::distance(populated_keys.begin(), end_it), stream);
+  auto const get_key  = [] __device__(auto const& element) { return element.first; };  // first = key
+  auto const key_used = [unused = map.get_unused_key()] __device__(auto key) {
+    return key != unused;
+  };
+  auto key_itr = thrust::make_transform_iterator(map.data(), get_key);
+
+  // thrust::copy_if has a bug where it cannot iterate over int-max values
+  // so if map.capacity() > int-max we'll call thrust::copy_if in chunks instead
+  auto const copy_size =
+    std::min(map.capacity(), static_cast<std::size_t>(std::numeric_limits<size_type>::max()));
+  auto const key_end = key_itr + map.capacity();
+  auto pop_keys_itr  = populated_keys.begin();
+
+  std::size_t output_size = 0;
+  while (key_itr != key_end) {
+    auto const copy_end = static_cast<std::size_t>(std::distance(key_itr, key_end)) <= copy_size
+                            ? key_end
+                            : key_itr + copy_size;
+    auto const end_it =
+      thrust::copy_if(rmm::exec_policy(stream), key_itr, copy_end, pop_keys_itr, key_used);
+    auto const copied = std::distance(pop_keys_itr, end_it);
+    pop_keys_itr += copied;
+    output_size += copied;
+    key_itr = copy_end;
+  }
+  populated_keys.resize(output_size, stream);

   return populated_keys;
 }
@@ -653,14 +668,6 @@ std::pair<std::unique_ptr<table>, std::vector<aggregation_result>> groupby(
   rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr)
 {
-  auto const has_nested_column =
-    std::any_of(keys.begin(), keys.end(), [](cudf::column_view const& col) {
-      return cudf::is_nested(col.type());
-    });
-  if (has_nested_column and include_null_keys == cudf::null_policy::EXCLUDE) {
-    CUDF_FAIL("Null keys of nested type cannot be excluded.");
-  }
-
   cudf::detail::result_cache cache(requests.size());

   std::unique_ptr<table> unique_keys =
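The loop in extract_populated_keys above works around a thrust::copy_if limitation with ranges longer than int-max by processing fixed-size windows. The idiom generalizes; a hedged sketch under the same assumption (this is not library code):

// Sketch only: feed thrust::copy_if windows no larger than int-max so the
// distance of any single call never overflows int.
#include <rmm/cuda_stream_view.hpp>
#include <rmm/exec_policy.hpp>

#include <thrust/copy.h>
#include <thrust/distance.h>

#include <algorithm>
#include <cstddef>
#include <limits>

template <typename InputIterator, typename OutputIterator, typename Predicate>
OutputIterator chunked_copy_if(InputIterator first,
                               InputIterator last,
                               OutputIterator result,
                               Predicate pred,
                               rmm::cuda_stream_view stream)
{
  auto const max_chunk = static_cast<std::size_t>(std::numeric_limits<int>::max());
  while (first != last) {
    auto const remaining = static_cast<std::size_t>(thrust::distance(first, last));
    auto const chunk_end = first + static_cast<std::ptrdiff_t>(std::min(remaining, max_chunk));
    // each call sees at most int-max elements; append results after the previous chunk
    result = thrust::copy_if(rmm::exec_policy(stream), first, chunk_end, result, pred);
    first  = chunk_end;
  }
  return result;
}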
diff --git a/cpp/src/groupby/sort/aggregate.cpp b/cpp/src/groupby/sort/aggregate.cpp
index 55a0b89e446..e3d14f1deb7 100644
--- a/cpp/src/groupby/sort/aggregate.cpp
+++ b/cpp/src/groupby/sort/aggregate.cpp
@@ -701,7 +701,7 @@ void aggregate_result_functor::operator()<aggregation::TDIGEST>(aggregation const& agg)

   cache.add_result(values,
                    agg,
-                   cudf::detail::tdigest::group_tdigest(
+                   cudf::tdigest::detail::group_tdigest(
                      get_sorted_values(),
                      helper.group_offsets(stream),
                      helper.group_labels(stream),
@@ -745,7 +745,7 @@ void aggregate_result_functor::operator()<aggregation::MERGE_TDIGEST>(aggregation const& agg)
     dynamic_cast<cudf::detail::merge_tdigest_aggregation const&>(agg).max_centroids;
   cache.add_result(values,
                    agg,
-                   cudf::detail::tdigest::group_merge_tdigest(get_grouped_values(),
+                   cudf::tdigest::detail::group_merge_tdigest(get_grouped_values(),
                                                               helper.group_offsets(stream),
                                                               helper.group_labels(stream),
                                                               helper.num_groups(stream),
diff --git a/cpp/src/groupby/sort/common_utils.cuh b/cpp/src/groupby/sort/common_utils.cuh
new file mode 100644
index 00000000000..fe5d7c325ca
--- /dev/null
+++ b/cpp/src/groupby/sort/common_utils.cuh
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <...>
+
+namespace cudf::groupby::detail {
+/**
+ * @brief Functor to compare two rows of a table in given permutation order
+ *
+ * This is useful to identify unique elements in a sorted order table, when the permutation order is
+ * the sorted order of the table.
+ */
+template <typename ComparatorT, typename Iterator>
+struct permuted_row_equality_comparator {
+  /**
+   * @brief Constructs a permuted comparator object which compares two rows of the table in given
+   * permutation order
+   *
+   * @param comparator Equality comparator
+   * @param permutation The permutation map that specifies the effective ordering of
+   * `t`. Must be the same size as `t.num_rows()`
+   */
+  permuted_row_equality_comparator(ComparatorT const& comparator, Iterator const permutation)
+    : _comparator{comparator}, _permutation{permutation}
+  {
+  }
+
+  /**
+   * @brief Returns true if the two rows at the specified indices in the permuted
+   * order are equivalent.
+   *
+   * For example, comparing rows `i` and `j` is equivalent to comparing
+   * rows `permutation[i]` and `permutation[j]` in the original table.
+   *
+   * @param lhs The index of the first row
+   * @param rhs The index of the second row
+   * @returns true if the two specified rows in the permuted order are equivalent
+   */
+  __device__ bool operator()(cudf::size_type lhs, cudf::size_type rhs) const
+  {
+    return _comparator(_permutation[lhs], _permutation[rhs]);
+  };
+
+ private:
+  ComparatorT const _comparator;
+  Iterator const _permutation;
+};
+}  // namespace cudf::groupby::detail
diff --git a/cpp/src/groupby/sort/functors.hpp b/cpp/src/groupby/sort/functors.hpp
index 748e34a583d..bcc190c745b 100644
--- a/cpp/src/groupby/sort/functors.hpp
+++ b/cpp/src/groupby/sort/functors.hpp
@@ -13,6 +13,8 @@
  * See the License for the specific language governing permissions and
  * limitations under the License.
  */
+#pragma once
+
 #include <...>
 #include <...>
 #include <...>
diff --git a/cpp/src/groupby/sort/group_nunique.cu b/cpp/src/groupby/sort/group_nunique.cu
index b719698b6b5..c411e654913 100644
--- a/cpp/src/groupby/sort/group_nunique.cu
+++ b/cpp/src/groupby/sort/group_nunique.cu
@@ -16,7 +16,7 @@

 #include <...>
 #include <...>
-#include <...>
+#include <...>

 #include <...>
 #include <...>
@@ -33,82 +33,45 @@
 namespace groupby {
 namespace detail {
 namespace {
-template <typename T, typename Nullate>
+template <typename Nullate>
 struct is_unique_iterator_fn {
+  using comparator_type =
+    typename cudf::experimental::row::equality::device_row_comparator<...>;
+
   Nullate nulls;
   column_device_view const v;
-  element_equality_comparator equal;
+  comparator_type equal;
   null_policy null_handling;
   size_type const* group_offsets;
   size_type const* group_labels;

   is_unique_iterator_fn(Nullate nulls,
                         column_device_view const& v,
+                        comparator_type const& equal,
                         null_policy null_handling,
                         size_type const* group_offsets,
                         size_type const* group_labels)
     : nulls{nulls},
       v{v},
-      equal{nulls, v, v},
+      equal{equal},
      null_handling{null_handling},
       group_offsets{group_offsets},
       group_labels{group_labels}
   {
   }

-  __device__ size_type operator()(size_type i)
+  __device__ size_type operator()(size_type i) const
   {
-    bool is_input_countable =
+    auto const is_input_countable =
       !nulls || (null_handling == null_policy::INCLUDE || v.is_valid_nocheck(i));
-    bool is_unique = is_input_countable &&
-                     (group_offsets[group_labels[i]] == i ||  // first element or
-                      (not equal.template operator()<T>(i, i - 1)));  // new unique value in sorted
+    auto const is_unique =
+      is_input_countable && (group_offsets[group_labels[i]] == i ||  // first element or
+                             (not equal(i, i - 1)));  // new unique value in sorted
     return static_cast<size_type>(is_unique);
   }
 };
-
-struct nunique_functor {
-  template <typename T>
-  std::enable_if_t<...(), std::unique_ptr<column>> operator()(
-    column_view const& values,
-    cudf::device_span<size_type const> group_labels,
-    size_type const num_groups,
-    cudf::device_span<size_type const> group_offsets,
-    null_policy null_handling,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
-  {
-    auto result = make_numeric_column(
-      data_type(type_to_id<size_type>()), num_groups, mask_state::UNALLOCATED, stream, mr);
-
-    if (num_groups == 0) { return result; }
-
-    auto values_view        = column_device_view::create(values, stream);
-    auto is_unique_iterator = thrust::make_transform_iterator(
-      thrust::make_counting_iterator<size_type>(0),
-      is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()},
-                            *values_view,
-                            null_handling,
-                            group_offsets.data(),
-                            group_labels.data()});
-    thrust::reduce_by_key(rmm::exec_policy(stream),
-                          group_labels.begin(),
-                          group_labels.end(),
-                          is_unique_iterator,
-                          thrust::make_discard_iterator(),
-                          result->mutable_view().begin<size_type>());
-
-    return result;
-  }
-
-  template <typename T, typename... Args>
-  std::enable_if_t<...(), std::unique_ptr<column>> operator()(Args&&...)
-  {
-    CUDF_FAIL("list_view group_nunique not supported yet");
-  }
-};
 }  // namespace

+
 std::unique_ptr<column> group_nunique(column_view const& values,
                                       cudf::device_span<size_type const> group_labels,
                                       size_type const num_groups,
@@ -121,15 +84,33 @@ std::unique_ptr<column> group_nunique(column_view const& values,
   CUDF_EXPECTS(static_cast<size_t>(values.size()) == group_labels.size(),
                "Size of values column should be same as that of group labels");

-  return type_dispatcher(values.type(),
-                         nunique_functor{},
-                         values,
-                         group_labels,
-                         num_groups,
-                         group_offsets,
-                         null_handling,
-                         stream,
-                         mr);
+  auto result = make_numeric_column(
+    data_type(type_to_id<size_type>()), num_groups, mask_state::UNALLOCATED, stream, mr);
+
+  if (num_groups == 0) { return result; }
+
+  auto const values_view = table_view{{values}};
+  auto const comparator  = cudf::experimental::row::equality::self_comparator{values_view, stream};
+  auto const d_equal     = comparator.equal_to(
+    cudf::nullate::DYNAMIC{cudf::has_nested_nulls(values_view)}, null_equality::EQUAL);
+
+  auto const d_values_view = column_device_view::create(values, stream);
+  auto const is_unique_iterator =
+    thrust::make_transform_iterator(thrust::counting_iterator<size_type>(0),
+                                    is_unique_iterator_fn{nullate::DYNAMIC{values.has_nulls()},
+                                                          *d_values_view,
+                                                          d_equal,
+                                                          null_handling,
+                                                          group_offsets.data(),
+                                                          group_labels.data()});
+  thrust::reduce_by_key(rmm::exec_policy(stream),
+                        group_labels.begin(),
+                        group_labels.end(),
+                        is_unique_iterator,
+                        thrust::make_discard_iterator(),
+                        result->mutable_view().begin<size_type>());
+
+  return result;
 }

 }  // namespace detail
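The same row-equality pattern recurs in group_nunique above and in the rank-scan and sort-helper changes that follow: build a self_comparator over the table once, obtain a device functor with equal_to(), and hand it to any thrust algorithm. A hedged fragment sketching the shape (values and stream are assumed to be in scope; the API is as shown in this diff), here counting rows equal to their predecessor:

// Fragment under the above assumptions.
auto const values_view = cudf::table_view{{values}};
auto const comparator =
  cudf::experimental::row::equality::self_comparator{values_view, stream};
auto const d_equal = comparator.equal_to(
  cudf::nullate::DYNAMIC{cudf::has_nested_nulls(values_view)}, cudf::null_equality::EQUAL);
auto const n_dups =
  thrust::count_if(rmm::exec_policy(stream),
                   thrust::counting_iterator<cudf::size_type>(1),
                   thrust::counting_iterator<cudf::size_type>(values.size()),
                   [d_equal] __device__(cudf::size_type i) { return d_equal(i, i - 1); });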
- * @param has_nulls whether the table has nulls - */ - permuted_comparator(table_device_view device_table, Iterator const permutation, bool has_nulls) - : comparator(nullate::DYNAMIC{has_nulls}, device_table, device_table, null_equality::EQUAL), - permutation(permutation) - { - } - __device__ bool operator()(size_type index1, size_type index2) const - { - return comparator(permutation[index1], permutation[index2]); - }; - - private: - row_equality_comparator comparator; - Iterator const permutation; -}; - /** * @brief generate grouped row ranks or dense ranks using a row comparison then scan the results * @@ -99,32 +71,29 @@ std::unique_ptr rank_generator(column_view const& grouped_values, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - auto const flattened = cudf::structs::detail::flatten_nested_columns( - table_view{{grouped_values}}, {}, {}, structs::detail::column_nullability::MATCH_INCOMING); - auto const d_flat_order = table_device_view::create(flattened, stream); - auto sorted_index_order = value_order.begin(); - auto comparator = permuted_comparator(*d_flat_order, sorted_index_order, has_nulls); + auto const comparator = + cudf::experimental::row::equality::self_comparator{table_view{{grouped_values}}, stream}; + auto const d_equal = comparator.equal_to(cudf::nullate::DYNAMIC{has_nulls}, null_equality::EQUAL); + auto const permuted_equal = + permuted_row_equality_comparator(d_equal, value_order.begin()); - auto ranks = make_fixed_width_column(data_type{type_to_id()}, - flattened.flattened_columns().num_rows(), - mask_state::UNALLOCATED, - stream, - mr); + auto ranks = make_fixed_width_column( + data_type{type_to_id()}, grouped_values.size(), mask_state::UNALLOCATED, stream, mr); auto mutable_ranks = ranks->mutable_view(); auto unique_identifier = [labels = group_labels.begin(), offsets = group_offsets.begin(), - comparator, + permuted_equal, resolver] __device__(size_type row_index) { auto const group_start = offsets[labels[row_index]]; if constexpr (forward) { // First value of equal values is 1. - return resolver(row_index == group_start || !comparator(row_index, row_index - 1), + return resolver(row_index == group_start || !permuted_equal(row_index, row_index - 1), row_index - group_start); } else { auto const group_end = offsets[labels[row_index] + 1]; // Last value of equal values is 1. 
- return resolver(row_index + 1 == group_end || !comparator(row_index, row_index + 1), + return resolver(row_index + 1 == group_end || !permuted_equal(row_index, row_index + 1), row_index - group_start); } }; diff --git a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh index 93d5e6c032c..58ee06fcfef 100644 --- a/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh +++ b/cpp/src/groupby/sort/group_single_pass_reduction_util.cuh @@ -25,7 +25,6 @@ #include #include #include -#include #include #include diff --git a/cpp/src/groupby/sort/scan.cpp b/cpp/src/groupby/sort/scan.cpp index 5d345273782..743ca5e8065 100644 --- a/cpp/src/groupby/sort/scan.cpp +++ b/cpp/src/groupby/sort/scan.cpp @@ -178,9 +178,9 @@ void scan_result_functor::operator()(aggregation const& agg) stream, mr); } - result = std::move(cudf::detail::scatter( - table_view{{*result}}, *gather_map, table_view{{*result}}, false, stream, mr) - ->release()[0]); + result = std::move( + cudf::detail::scatter(table_view{{*result}}, *gather_map, table_view{{*result}}, stream, mr) + ->release()[0]); if (rank_agg._null_handling == null_policy::EXCLUDE) { result->set_null_mask(cudf::detail::copy_bitmask(get_grouped_values(), stream, mr)); } diff --git a/cpp/src/groupby/sort/sort_helper.cu b/cpp/src/groupby/sort/sort_helper.cu index a0abaf71160..2bf63cb42fc 100644 --- a/cpp/src/groupby/sort/sort_helper.cu +++ b/cpp/src/groupby/sort/sort_helper.cu @@ -14,6 +14,8 @@ * limitations under the License. */ +#include "common_utils.cuh" + #include #include #include @@ -26,7 +28,7 @@ #include #include #include -#include +#include #include #include @@ -44,48 +46,6 @@ #include #include -namespace { -/** - * @brief Compares two `table` rows for equality as if the table were - * ordered according to a specified permutation map. - */ -struct permuted_row_equality_comparator { - cudf::row_equality_comparator _comparator; - cudf::size_type const* _map; - - /** - * @brief Construct a permuted_row_equality_comparator. - * - * @param t The `table` whose rows will be compared - * @param map The permutation map that specifies the effective ordering of - * `t`. Must be the same size as `t.num_rows()` - */ - permuted_row_equality_comparator(cudf::table_device_view const& t, - cudf::size_type const* map, - bool nullable = true) - : _comparator(cudf::nullate::DYNAMIC{nullable}, t, t, cudf::null_equality::EQUAL), _map{map} - { - } - - /** - * @brief Returns true if the two rows at the specified indices in the permuted - * order are equivalent. - * - * For example, comparing rows `i` and `j` is - * equivalent to comparing rows `map[i]` and `map[j]` in the original table. 
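The comparator being deleted below (and re-homed in common_utils.cuh as permuted_row_equality_comparator) exists so that group boundaries can be found without materializing the sorted table: row indices are compared through the sort permutation. A compact STL analogue of the unique_copy call that derives group offsets this way, on hypothetical toy data:

```cpp
// Minimal host analogue (assumed, not the device code) of deriving
// group_offsets: keep every sorted position whose key differs from its
// predecessor's key, i.e. std::unique-style adjacent comparison over a
// counting sequence, using a permuted comparator.
#include <algorithm>
#include <iostream>
#include <iterator>
#include <numeric>
#include <vector>

int main()
{
  std::vector<int> keys{5, 3, 3, 5, 3};
  std::vector<int> sorted_order{1, 2, 4, 0, 3};  // indices that sort `keys`

  std::vector<int> idx(keys.size());
  std::iota(idx.begin(), idx.end(), 0);

  std::vector<int> group_offsets;
  std::copy_if(idx.begin(), idx.end(), std::back_inserter(group_offsets), [&](int i) {
    // A sorted position starts a new group when it is first, or its key
    // differs from the previous sorted position's key.
    return i == 0 || keys[sorted_order[i]] != keys[sorted_order[i - 1]];
  });
  group_offsets.push_back(static_cast<int>(keys.size()));  // sentinel end offset

  for (int o : group_offsets) { std::cout << o << ' '; }  // 0 3 5
  std::cout << '\n';
}
```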
- * - * @param lhs The index of the first row - * @param rhs The index of the second row - * @returns true if the two specified rows in the permuted order are equivalent - */ - __device__ inline bool operator()(cudf::size_type lhs, cudf::size_type rhs) - { - return _comparator(_map[lhs], _map[rhs]); - } -}; - -} // namespace - namespace cudf { namespace groupby { namespace detail { @@ -94,19 +54,13 @@ namespace sort { sort_groupby_helper::sort_groupby_helper(table_view const& keys, null_policy include_null_keys, sorted keys_pre_sorted) - : _unflattened_keys(keys), + : _keys(keys), _num_keys(-1), _keys_pre_sorted(keys_pre_sorted), _include_null_keys(include_null_keys) { using namespace cudf::structs::detail; - _flattened = flatten_nested_columns(keys, {}, {}, column_nullability::FORCE); - _keys = _flattened; - auto is_supported_key_type = [](auto col) { return cudf::is_equality_comparable(col.type()); }; - CUDF_EXPECTS(std::all_of(_keys.begin(), _keys.end(), is_supported_key_type), - "Unsupported groupby key type does not support equality comparison"); - // Cannot depend on caller's sorting if the column contains nulls, // and null values are to be excluded. // Re-sort the data, to filter out nulls more easily. @@ -191,16 +145,17 @@ sort_groupby_helper::index_vector const& sort_groupby_helper::group_offsets( _group_offsets = std::make_unique(num_keys(stream) + 1, stream); - auto device_input_table = table_device_view::create(_keys, stream); - auto sorted_order = key_sort_order(stream).data(); + auto const comparator = cudf::experimental::row::equality::self_comparator{_keys, stream}; + auto const d_key_equal = comparator.equal_to( + cudf::nullate::DYNAMIC{cudf::has_nested_nulls(_keys)}, null_equality::EQUAL); + auto const sorted_order = key_sort_order(stream).data(); decltype(_group_offsets->begin()) result_end; - result_end = thrust::unique_copy( - rmm::exec_policy(stream), - thrust::make_counting_iterator(0), - thrust::make_counting_iterator(num_keys(stream)), - _group_offsets->begin(), - permuted_row_equality_comparator(*device_input_table, sorted_order, has_nulls(_keys))); + result_end = thrust::unique_copy(rmm::exec_policy(stream), + thrust::counting_iterator(0), + thrust::counting_iterator(num_keys(stream)), + _group_offsets->begin(), + permuted_row_equality_comparator(d_key_equal, sorted_order)); size_type num_groups = thrust::distance(_group_offsets->begin(), result_end); _group_offsets->set_element(num_groups, num_keys(stream), stream); @@ -244,7 +199,6 @@ column_view sort_groupby_helper::unsorted_keys_labels(rmm::cuda_stream_view stre cudf::detail::scatter(table_view({group_labels_view}), scatter_map, table_view({temp_labels->view()}), - false, stream, rmm::mr::get_current_device_resource()); @@ -316,7 +270,7 @@ std::unique_ptr
sort_groupby_helper::unique_keys(rmm::cuda_stream_view st auto gather_map_it = thrust::make_transform_iterator( group_offsets(stream).begin(), [idx_data] __device__(size_type i) { return idx_data[i]; }); - return cudf::detail::gather(_unflattened_keys, + return cudf::detail::gather(_keys, gather_map_it, gather_map_it + num_groups(stream), out_of_bounds_policy::DONT_CHECK, @@ -327,7 +281,7 @@ std::unique_ptr
sort_groupby_helper::unique_keys(rmm::cuda_stream_view st std::unique_ptr
sort_groupby_helper::sorted_keys(rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - return cudf::detail::gather(_unflattened_keys, + return cudf::detail::gather(_keys, key_sort_order(stream), cudf::out_of_bounds_policy::DONT_CHECK, cudf::detail::negative_index_policy::NOT_ALLOWED, diff --git a/cpp/src/hash/concurrent_unordered_map.cuh b/cpp/src/hash/concurrent_unordered_map.cuh index c2081c596a1..f99aabc56bf 100644 --- a/cpp/src/hash/concurrent_unordered_map.cuh +++ b/cpp/src/hash/concurrent_unordered_map.cuh @@ -159,7 +159,7 @@ class concurrent_unordered_map { * storage */ static auto create(size_type capacity, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, const mapped_type unused_element = std::numeric_limits::max(), const key_type unused_key = std::numeric_limits::max(), const Hasher& hash_function = hasher(), @@ -421,8 +421,7 @@ class concurrent_unordered_map { } } - void assign_async(const concurrent_unordered_map& other, - rmm::cuda_stream_view stream = cudf::default_stream_value) + void assign_async(const concurrent_unordered_map& other, rmm::cuda_stream_view stream) { if (other.m_capacity <= m_capacity) { m_capacity = other.m_capacity; @@ -440,7 +439,7 @@ class concurrent_unordered_map { stream.value())); } - void clear_async(rmm::cuda_stream_view stream = cudf::default_stream_value) + void clear_async(rmm::cuda_stream_view stream) { constexpr int block_size = 128; init_hashtbl<<<((m_capacity - 1) / block_size) + 1, block_size, 0, stream.value()>>>( @@ -455,7 +454,7 @@ class concurrent_unordered_map { } } - void prefetch(const int dev_id, rmm::cuda_stream_view stream = cudf::default_stream_value) + void prefetch(const int dev_id, rmm::cuda_stream_view stream) { cudaPointerAttributes hashtbl_values_ptr_attributes; cudaError_t status = cudaPointerGetAttributes(&hashtbl_values_ptr_attributes, m_hashtbl_values); @@ -475,7 +474,7 @@ class concurrent_unordered_map { * * @param stream CUDA stream used for device memory operations and kernel launches. */ - void destroy(rmm::cuda_stream_view stream = cudf::default_stream_value) + void destroy(rmm::cuda_stream_view stream) { m_allocator.deallocate(m_hashtbl_values, m_capacity, stream); delete this; @@ -516,7 +515,7 @@ class concurrent_unordered_map { const Hasher& hash_function, const Equality& equal, const allocator_type& allocator, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream) : m_hf(hash_function), m_equal(equal), m_allocator(allocator), diff --git a/cpp/src/hash/hash_allocator.cuh b/cpp/src/hash/hash_allocator.cuh index 2da0a4fb4bd..207f46ae543 100644 --- a/cpp/src/hash/hash_allocator.cuh +++ b/cpp/src/hash/hash_allocator.cuh @@ -14,8 +14,7 @@ * limitations under the License. 
*/ -#ifndef HASH_ALLOCATOR_CUH -#define HASH_ALLOCATOR_CUH +#pragma once #include @@ -26,42 +25,6 @@ #include #include -template -struct managed_allocator { - using value_type = T; - rmm::mr::device_memory_resource* mr = new rmm::mr::managed_memory_resource; - - managed_allocator() = default; - - template - constexpr managed_allocator(const managed_allocator&) noexcept - { - } - - T* allocate(std::size_t n, rmm::cuda_stream_view stream = cudf::default_stream_value) const - { - return static_cast(mr->allocate(n * sizeof(T), stream)); - } - - void deallocate(T* p, - std::size_t n, - rmm::cuda_stream_view stream = cudf::default_stream_value) const - { - mr->deallocate(p, n * sizeof(T), stream); - } -}; - -template -bool operator==(const managed_allocator&, const managed_allocator&) -{ - return true; -} -template -bool operator!=(const managed_allocator&, const managed_allocator&) -{ - return false; -} - template struct default_allocator { using value_type = T; @@ -74,14 +37,14 @@ struct default_allocator { { } - T* allocate(std::size_t n, rmm::cuda_stream_view stream = cudf::default_stream_value) const + T* allocate(std::size_t n, rmm::cuda_stream_view stream = cudf::get_default_stream()) const { return static_cast(mr->allocate(n * sizeof(T), stream)); } void deallocate(T* p, std::size_t n, - rmm::cuda_stream_view stream = cudf::default_stream_value) const + rmm::cuda_stream_view stream = cudf::get_default_stream()) const { mr->deallocate(p, n * sizeof(T), stream); } @@ -97,5 +60,3 @@ bool operator!=(const default_allocator&, const default_allocator&) { return false; } - -#endif diff --git a/cpp/src/hash/hashing.cu b/cpp/src/hash/hashing.cu index e5fac1e7c2c..150017d9117 100644 --- a/cpp/src/hash/hashing.cu +++ b/cpp/src/hash/hashing.cu @@ -74,7 +74,7 @@ std::unique_ptr hash(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::hash(input, hash_function, seed, cudf::default_stream_value, mr); + return detail::hash(input, hash_function, seed, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/hash/helper_functions.cuh b/cpp/src/hash/helper_functions.cuh index 3b8d8528894..70fc47538c9 100644 --- a/cpp/src/hash/helper_functions.cuh +++ b/cpp/src/hash/helper_functions.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020, NVIDIA CORPORATION. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,7 @@ * limitations under the License. */ -#ifndef HELPER_FUNCTIONS_CUH -#define HELPER_FUNCTIONS_CUH +#pragma once #include @@ -242,5 +241,3 @@ __host__ __device__ bool operator!=(const cycle_iterator_adapter& lhs, { return !lhs.equal(rhs); } - -#endif // HELPER_FUNCTIONS_CUH diff --git a/cpp/src/hash/managed.cuh b/cpp/src/hash/managed.cuh index c5aab78589e..d85a12c69a9 100644 --- a/cpp/src/hash/managed.cuh +++ b/cpp/src/hash/managed.cuh @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, NVIDIA CORPORATION. + * Copyright (c) 2017-2022, NVIDIA CORPORATION. * * Licensed under the Apache License, Version 2.0 (the "License"); * you may not use this file except in compliance with the License. @@ -14,8 +14,7 @@ * limitations under the License. 
*/ -#ifndef MANAGED_CUH -#define MANAGED_CUH +#pragma once #include @@ -43,5 +42,3 @@ inline bool isPtrManaged(cudaPointerAttributes attr) return attr.isManaged; #endif } - -#endif // MANAGED_CUH diff --git a/cpp/src/interop/dlpack.cpp b/cpp/src/interop/dlpack.cpp index 7b300924dd5..58afc8e9015 100644 --- a/cpp/src/interop/dlpack.cpp +++ b/cpp/src/interop/dlpack.cpp @@ -299,13 +299,13 @@ std::unique_ptr
from_dlpack(DLManagedTensor const* managed_tensor, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::from_dlpack(managed_tensor, cudf::default_stream_value, mr); + return detail::from_dlpack(managed_tensor, cudf::get_default_stream(), mr); } DLManagedTensor* to_dlpack(table_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::to_dlpack(input, cudf::default_stream_value, mr); + return detail::to_dlpack(input, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/interop/from_arrow.cu b/cpp/src/interop/from_arrow.cu index 86ea6f4427e..2d4501ec9f7 100644 --- a/cpp/src/interop/from_arrow.cu +++ b/cpp/src/interop/from_arrow.cu @@ -450,7 +450,7 @@ std::unique_ptr
from_arrow(arrow::Table const& input_table, { CUDF_FUNC_RANGE(); - return detail::from_arrow(input_table, cudf::default_stream_value, mr); + return detail::from_arrow(input_table, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/interop/to_arrow.cu b/cpp/src/interop/to_arrow.cu index eeb27c2ac05..fb203e6c3c1 100644 --- a/cpp/src/interop/to_arrow.cu +++ b/cpp/src/interop/to_arrow.cu @@ -154,7 +154,7 @@ std::shared_ptr dispatch_to_arrow::operator()( auto count = thrust::make_counting_iterator(0); - thrust::for_each(rmm::exec_policy(cudf::default_stream_value), + thrust::for_each(rmm::exec_policy(cudf::get_default_stream()), count, count + input.size(), [in = input.begin(), out = buf.data()] __device__(auto in_idx) { @@ -416,7 +416,7 @@ std::shared_ptr to_arrow(table_view input, arrow::MemoryPool* ar_mr) { CUDF_FUNC_RANGE(); - return detail::to_arrow(input, metadata, cudf::default_stream_value, ar_mr); + return detail::to_arrow(input, metadata, cudf::get_default_stream(), ar_mr); } } // namespace cudf diff --git a/cpp/src/io/comp/nvcomp_adapter.cpp b/cpp/src/io/comp/nvcomp_adapter.cpp index fd794b2e66c..fd0cbeced3a 100644 --- a/cpp/src/io/comp/nvcomp_adapter.cpp +++ b/cpp/src/io/comp/nvcomp_adapter.cpp @@ -31,46 +31,23 @@ #include NVCOMP_ZSTD_HEADER #endif -#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION >= 3) -#define NVCOMP_HAS_ZSTD_DECOMP 1 -#else -#define NVCOMP_HAS_ZSTD_DECOMP 0 -#endif +#define NVCOMP_HAS_ZSTD_DECOMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 3)) -#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION >= 4) -#define NVCOMP_HAS_ZSTD_COMP 1 -#else -#define NVCOMP_HAS_ZSTD_COMP 0 -#endif +#define NVCOMP_HAS_ZSTD_COMP(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 4)) -#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION >= 3) -#define NVCOMP_HAS_DEFLATE 1 -#else -#define NVCOMP_HAS_DEFLATE 0 -#endif +#define NVCOMP_HAS_DEFLATE(MAJOR, MINOR, PATCH) (MAJOR > 2 or (MAJOR == 2 and MINOR >= 5)) -#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION > 3) or \ - (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION == 3 and NVCOMP_PATCH_VERSION >= 1) -#define NVCOMP_HAS_TEMPSIZE_EX 1 -#else -#define NVCOMP_HAS_TEMPSIZE_EX 0 -#endif +#define NVCOMP_HAS_TEMPSIZE_EX(MAJOR, MINOR, PATCH) \ + (MAJOR > 2 or (MAJOR == 2 and MINOR > 3) or (MAJOR == 2 and MINOR == 3 and PATCH >= 1)) // ZSTD is stable for nvcomp 2.3.2 or newer -#if NVCOMP_MAJOR_VERSION > 2 or (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION > 3) or \ - (NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION == 3 and NVCOMP_PATCH_VERSION >= 2) -#define NVCOMP_ZSTD_IS_STABLE 1 -#else -#define NVCOMP_ZSTD_IS_STABLE 0 -#endif +#define NVCOMP_ZSTD_DECOMP_IS_STABLE(MAJOR, MINOR, PATCH) \ + (MAJOR > 2 or (MAJOR == 2 and MINOR > 3) or (MAJOR == 2 and MINOR == 3 and PATCH >= 2)) // Issue https://github.com/NVIDIA/spark-rapids/issues/6614 impacts nvCOMP 2.4.0 ZSTD decompression // on compute 6.x -#if NVCOMP_MAJOR_VERSION == 2 and NVCOMP_MINOR_VERSION == 4 and NVCOMP_PATCH_VERSION == 0 -#define NVCOMP_ZSTD_IS_DISABLED_ON_PASCAL 1 -#else -#define NVCOMP_ZSTD_IS_DISABLED_ON_PASCAL 0 -#endif +#define NVCOMP_ZSTD_IS_DISABLED_ON_PASCAL(MAJOR, MINOR, PATCH) \ + (MAJOR == 2 and MINOR == 4 and PATCH == 0) namespace cudf::io::nvcomp { @@ -79,12 +56,12 @@ template std::optional batched_decompress_get_temp_size_ex(compression_type compression, Args&&... 
args) { -#if NVCOMP_HAS_TEMPSIZE_EX +#if NVCOMP_HAS_TEMPSIZE_EX(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) switch (compression) { case compression_type::SNAPPY: return nvcompBatchedSnappyDecompressGetTempSizeEx(std::forward(args)...); case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_DECOMP +#if NVCOMP_HAS_ZSTD_DECOMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompBatchedZstdDecompressGetTempSizeEx(std::forward(args)...); #else return std::nullopt; @@ -104,16 +81,18 @@ auto batched_decompress_get_temp_size(compression_type compression, Args&&... ar case compression_type::SNAPPY: return nvcompBatchedSnappyDecompressGetTempSize(std::forward(args)...); case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_DECOMP +#if NVCOMP_HAS_ZSTD_DECOMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompBatchedZstdDecompressGetTempSize(std::forward(args)...); #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Decompression error: " + + nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD).value()); #endif case compression_type::DEFLATE: -#if NVCOMP_HAS_DEFLATE +#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompBatchedDeflateDecompressGetTempSize(std::forward(args)...); #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Decompression error: " + + nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE).value()); #endif default: CUDF_FAIL("Unsupported compression type"); } @@ -127,16 +106,18 @@ auto batched_decompress_async(compression_type compression, Args&&... args) case compression_type::SNAPPY: return nvcompBatchedSnappyDecompressAsync(std::forward(args)...); case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_DECOMP +#if NVCOMP_HAS_ZSTD_DECOMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompBatchedZstdDecompressAsync(std::forward(args)...); #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Decompression error: " + + nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD).value()); #endif case compression_type::DEFLATE: -#if NVCOMP_HAS_DEFLATE +#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompBatchedDeflateDecompressAsync(std::forward(args)...); #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Decompression error: " + + nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE).value()); #endif default: CUDF_FAIL("Unsupported compression type"); } @@ -163,22 +144,6 @@ size_t batched_decompress_temp_size(compression_type compression, return temp_size; } -void check_is_zstd_enabled() -{ - CUDF_EXPECTS(NVCOMP_HAS_ZSTD_DECOMP, "nvCOMP 2.3 or newer is required for Zstandard compression"); - CUDF_EXPECTS(NVCOMP_ZSTD_IS_STABLE or cudf::io::detail::nvcomp_integration::is_all_enabled(), - "Zstandard compression is experimental, you can enable it through " - "`LIBCUDF_NVCOMP_POLICY` environment variable."); - -#if NVCOMP_ZSTD_IS_DISABLED_ON_PASCAL - int device; - int cc_major; - CUDF_CUDA_TRY(cudaGetDevice(&device)); - CUDF_CUDA_TRY(cudaDeviceGetAttribute(&cc_major, cudaDevAttrComputeCapabilityMajor, device)); - CUDF_EXPECTS(cc_major != 6, "Zstandard decompression is disabled on Pascal GPUs"); -#endif -} - void batched_decompress(compression_type compression, device_span const> inputs, device_span const> outputs, @@ -187,8 +152,6 @@ void batched_decompress(compression_type compression, size_t 
max_total_uncomp_size, rmm::cuda_stream_view stream) { - if (compression == compression_type::ZSTD) { check_is_zstd_enabled(); } - auto const num_chunks = inputs.size(); // cuDF inflate inputs converted to nvcomp inputs @@ -228,20 +191,22 @@ auto batched_compress_temp_size(compression_type compression, batch_size, max_uncompressed_chunk_bytes, nvcompBatchedSnappyDefaultOpts, &temp_size); break; case compression_type::DEFLATE: -#if NVCOMP_HAS_DEFLATE +#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) nvcomp_status = nvcompBatchedDeflateCompressGetTempSize( batch_size, max_uncompressed_chunk_bytes, nvcompBatchedDeflateDefaultOpts, &temp_size); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE).value()); #endif case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_COMP +#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) nvcomp_status = nvcompBatchedZstdCompressGetTempSize( batch_size, max_uncompressed_chunk_bytes, nvcompBatchedZstdDefaultOpts, &temp_size); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value()); #endif default: CUDF_FAIL("Unsupported compression type"); } @@ -266,20 +231,22 @@ size_t compress_max_output_chunk_size(compression_type compression, capped_uncomp_bytes, nvcompBatchedSnappyDefaultOpts, &max_comp_chunk_size); break; case compression_type::DEFLATE: -#if NVCOMP_HAS_DEFLATE +#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) status = nvcompBatchedDeflateCompressGetMaxOutputChunkSize( capped_uncomp_bytes, nvcompBatchedDeflateDefaultOpts, &max_comp_chunk_size); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE).value()); #endif case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_COMP +#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) status = nvcompBatchedZstdCompressGetMaxOutputChunkSize( capped_uncomp_bytes, nvcompBatchedZstdDefaultOpts, &max_comp_chunk_size); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value()); #endif default: CUDF_FAIL("Unsupported compression type"); } @@ -316,7 +283,7 @@ static void batched_compress_async(compression_type compression, stream.value()); break; case compression_type::DEFLATE: -#if NVCOMP_HAS_DEFLATE +#if NVCOMP_HAS_DEFLATE(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) nvcomp_status = nvcompBatchedDeflateCompressAsync(device_uncompressed_ptrs, device_uncompressed_bytes, max_uncompressed_chunk_bytes, @@ -329,10 +296,11 @@ static void batched_compress_async(compression_type compression, stream.value()); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE).value()); #endif case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_COMP +#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) nvcomp_status = nvcompBatchedZstdCompressAsync(device_uncompressed_ptrs, device_uncompressed_bytes, max_uncompressed_chunk_bytes, @@ -345,7 +313,8 @@ static void batched_compress_async(compression_type 
compression, stream.value()); break; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value()); #endif default: CUDF_FAIL("Unsupported compression type"); } @@ -390,18 +359,109 @@ void batched_compress(compression_type compression, update_compression_results(actual_compressed_data_sizes, results, stream); } -bool is_compression_enabled(compression_type compression) +feature_status_parameters::feature_status_parameters() + : lib_major_version{NVCOMP_MAJOR_VERSION}, + lib_minor_version{NVCOMP_MINOR_VERSION}, + lib_patch_version{NVCOMP_PATCH_VERSION}, + are_all_integrations_enabled{detail::nvcomp_integration::is_all_enabled()}, + are_stable_integrations_enabled{detail::nvcomp_integration::is_stable_enabled()} +{ + int device; + CUDF_CUDA_TRY(cudaGetDevice(&device)); + CUDF_CUDA_TRY( + cudaDeviceGetAttribute(&compute_capability_major, cudaDevAttrComputeCapabilityMajor, device)); +} + +std::optional is_compression_disabled(compression_type compression, + feature_status_parameters params) { switch (compression) { - case compression_type::DEFLATE: - // See https://github.com/rapidsai/cudf/issues/11812 - return false; - case compression_type::SNAPPY: return detail::nvcomp_integration::is_stable_enabled(); - case compression_type::ZSTD: - return NVCOMP_HAS_ZSTD_COMP and detail::nvcomp_integration::is_all_enabled(); - default: return false; + case compression_type::DEFLATE: { + if (not NVCOMP_HAS_DEFLATE( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) { + return "nvCOMP 2.5 or newer is required for Deflate compression"; + } + if (not params.are_all_integrations_enabled) { + return "DEFLATE compression is experimental, you can enable it through " + "`LIBCUDF_NVCOMP_POLICY` environment variable."; + } + return std::nullopt; + } + case compression_type::SNAPPY: { + if (not params.are_stable_integrations_enabled) { + return "Snappy compression has been disabled through the `LIBCUDF_NVCOMP_POLICY` " + "environment variable."; + } + return std::nullopt; + } + case compression_type::ZSTD: { + if (not NVCOMP_HAS_ZSTD_COMP( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) { + return "nvCOMP 2.4 or newer is required for Zstandard compression"; + } + if (not params.are_stable_integrations_enabled) { + return "Zstandard compression is experimental, you can enable it through " + "`LIBCUDF_NVCOMP_POLICY` environment variable."; + } + return std::nullopt; + } + default: return "Unsupported compression type"; + } + return "Unsupported compression type"; +} + +std::optional is_zstd_decomp_disabled(feature_status_parameters const& params) +{ + if (not NVCOMP_HAS_ZSTD_DECOMP( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) { + return "nvCOMP 2.3 or newer is required for Zstandard decompression"; + } + + if (NVCOMP_ZSTD_DECOMP_IS_STABLE( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) { + if (not params.are_stable_integrations_enabled) { + return "Zstandard decompression has been disabled through the `LIBCUDF_NVCOMP_POLICY` " + "environment variable."; + } + } else if (not params.are_all_integrations_enabled) { + return "Zstandard decompression is experimental, you can enable it through " + "`LIBCUDF_NVCOMP_POLICY` environment variable."; + } + + if (NVCOMP_ZSTD_IS_DISABLED_ON_PASCAL( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version) and + 
params.compute_capability_major == 6) { + return "Zstandard decompression is disabled on Pascal GPUs"; + } + return std::nullopt; +} + +std::optional is_decompression_disabled(compression_type compression, + feature_status_parameters params) +{ + switch (compression) { + case compression_type::DEFLATE: { + if (not NVCOMP_HAS_DEFLATE( + params.lib_major_version, params.lib_minor_version, params.lib_patch_version)) { + return "nvCOMP 2.5 or newer is required for Deflate decompression"; + } + if (not params.are_all_integrations_enabled) { + return "DEFLATE decompression is experimental, you can enable it through " + "`LIBCUDF_NVCOMP_POLICY` environment variable."; + } + return std::nullopt; + } + case compression_type::SNAPPY: { + if (not params.are_stable_integrations_enabled) { + return "Snappy decompression has been disabled through the `LIBCUDF_NVCOMP_POLICY` " + "environment variable."; + } + return std::nullopt; + } + case compression_type::ZSTD: return is_zstd_decomp_disabled(params); + default: return "Unsupported compression type"; } - return false; + return "Unsupported compression type"; } size_t compress_input_alignment_bits(compression_type compression) @@ -430,10 +490,11 @@ std::optional compress_max_allowed_chunk_size(compression_type compressi case compression_type::DEFLATE: return 64 * 1024; case compression_type::SNAPPY: return std::nullopt; case compression_type::ZSTD: -#if NVCOMP_HAS_ZSTD_COMP +#if NVCOMP_HAS_ZSTD_COMP(NVCOMP_MAJOR_VERSION, NVCOMP_MINOR_VERSION, NVCOMP_PATCH_VERSION) return nvcompZstdCompressionMaxAllowedChunkSize; #else - CUDF_FAIL("Unsupported compression type"); + CUDF_FAIL("Compression error: " + + nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD).value()); #endif default: return std::nullopt; } diff --git a/cpp/src/io/comp/nvcomp_adapter.hpp b/cpp/src/io/comp/nvcomp_adapter.hpp index a13cb031163..a6bde7957c7 100644 --- a/cpp/src/io/comp/nvcomp_adapter.hpp +++ b/cpp/src/io/comp/nvcomp_adapter.hpp @@ -18,6 +18,8 @@ #include "gpuinflate.hpp" +#include + #include #include @@ -30,14 +32,52 @@ namespace cudf::io::nvcomp { enum class compression_type { SNAPPY, ZSTD, DEFLATE }; /** - * @brief Whether the given compression type is enabled through nvCOMP. + * @brief Set of parameters that impact whether the use of nvCOMP features is enabled. + */ +struct feature_status_parameters { + int lib_major_version; + int lib_minor_version; + int lib_patch_version; + bool are_all_integrations_enabled; + bool are_stable_integrations_enabled; + int compute_capability_major; + + feature_status_parameters(); + feature_status_parameters( + int major, int minor, int patch, bool all_enabled, bool stable_enabled, int cc_major) + : lib_major_version{major}, + lib_minor_version{minor}, + lib_patch_version{patch}, + are_all_integrations_enabled{all_enabled}, + are_stable_integrations_enabled{stable_enabled}, + compute_capability_major{cc_major} + { + } +}; + +/** + * @brief If a compression type is disabled through nvCOMP, returns the reason as a string. + * + * Result can depend on nvCOMP version and environment variables.
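The refactor above replaces the boolean is_compression_enabled with queries that return an optional human-readable reason for disablement, and turns the compile-time version checks into function-style macros so they can be evaluated against injected version numbers. A standalone sketch of that shape, with a deliberately reduced stand-in for feature_status_parameters (all names below are illustrative, not the library's):

```cpp
#include <iostream>
#include <optional>
#include <string>

// Function-style version gate, mirroring NVCOMP_HAS_ZSTD_COMP(MAJOR, MINOR, PATCH).
#define HAS_ZSTD_COMP(MAJOR, MINOR, PATCH) ((MAJOR) > 2 or ((MAJOR) == 2 and (MINOR) >= 4))

struct status_params {  // reduced stand-in for feature_status_parameters
  int major, minor, patch;
  bool stable_integrations_enabled;
};

// Returns the reason ZSTD compression is disabled, or nullopt if it is usable.
std::optional<std::string> zstd_compression_disabled(status_params p)
{
  if (not HAS_ZSTD_COMP(p.major, p.minor, p.patch))
    return "nvCOMP 2.4 or newer is required for Zstandard compression";
  if (not p.stable_integrations_enabled)
    return "Zstandard compression is experimental, you can enable it through "
           "`LIBCUDF_NVCOMP_POLICY` environment variable.";
  return std::nullopt;
}

int main()
{
  // Because the version is a runtime argument here, the policy is testable
  // without recompiling against a different nvCOMP.
  if (auto reason = zstd_compression_disabled({2, 3, 0, true}))
    std::cout << *reason << '\n';
}
```

Passing the parameters as a struct rather than reading macros at the call site is what lets the CUDF_FAIL sites above report the precise disablement reason via `.value()`.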
+ * + * @param compression Compression type + * @param params Optional parameters to query status with different configurations + * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled + */ +[[nodiscard]] std::optional is_compression_disabled( + compression_type compression, feature_status_parameters params = feature_status_parameters()); + +/** + * @brief If a decompression type is disabled through nvCOMP, returns the reason as a string. * - * Result depends on nvCOMP version and environment variables. + * Result can depend on nvCOMP version and environment variables. * * @param compression Compression type - * @returns true if nvCOMP use is enabled; false otherwise + * @param params Optional parameters to query status with different configurations + * @returns Reason for the feature disablement, `std::nullopt` if the feature is enabled */ -[[nodiscard]] bool is_compression_enabled(compression_type compression); +[[nodiscard]] std::optional is_decompression_disabled( + compression_type compression, feature_status_parameters params = feature_status_parameters()); /** * @brief Device batch decompression of given type. diff --git a/cpp/src/io/csv/reader_impl.cu b/cpp/src/io/csv/reader_impl.cu index d669dea3115..075e9e2c965 100644 --- a/cpp/src/io/csv/reader_impl.cu +++ b/cpp/src/io/csv/reader_impl.cu @@ -134,7 +134,6 @@ std::vector get_column_names(std::vector const& header, if (header.size() <= 1) { return col_names; } std::vector first_row = header; - int num_cols = 0; bool quotation = false; for (size_t pos = 0, prev = 0; pos < first_row.size(); ++pos) { @@ -163,17 +162,16 @@ std::vector get_column_names(std::vector const& header, const string new_col_name(first_row.data() + prev, col_name_len); col_names.push_back(removeQuotes(new_col_name, parse_opts.quotechar)); - - // Stop parsing when we hit the line terminator; relevant when there is - // a blank line following the header. In this case, first_row includes - // multiple line terminators at the end, as the new recStart belongs to - // a line that comes after the blank line(s) - if (!quotation && first_row[pos] == parse_opts.terminator) { break; } } else { // This is the first data row, add the automatically generated name - col_names.push_back(prefix + std::to_string(num_cols)); + col_names.push_back(prefix + std::to_string(col_names.size())); } - num_cols++; + + // Stop parsing when we hit the line terminator; relevant when there is + // a blank line following the header. 
In this case, first_row includes + // multiple line terminators at the end, as the new recStart belongs to + // a line that comes after the blank line(s) + if (!quotation && first_row[pos] == parse_opts.terminator) { break; } // Skip adjacent delimiters if delim_whitespace is set while (parse_opts.multi_delimiter && pos < first_row.size() && @@ -540,8 +538,7 @@ void infer_column_types(parse_options const& parse_opts, auto const& stats = column_stats[inf_col_idx++]; unsigned long long int_count_total = stats.big_int_count + stats.negative_small_int_count + stats.positive_small_int_count; - - if (stats.null_count == num_records) { + if (stats.null_count == num_records or stats.total_count() == 0) { // Entire column is NULL; allocate the smallest amount of memory column_types[col_idx] = data_type(cudf::type_id::INT8); } else if (stats.string_count > 0L) { @@ -679,32 +676,37 @@ table_with_metadata read_csv(cudf::io::datasource* source, auto const& data = data_row_offsets.first; auto const& row_offsets = data_row_offsets.second; - // Exclude the end-of-data row from number of rows with actual data - auto num_records = std::max(row_offsets.size(), 1ul) - 1; - auto column_flags = std::vector(); - auto column_names = std::vector(); - auto num_actual_columns = static_cast(reader_opts.get_names().size()); - auto num_active_columns = num_actual_columns; - - // Check if the user gave us a list of column names - if (not reader_opts.get_names().empty()) { - column_flags.resize(reader_opts.get_names().size(), - column_parse::enabled | column_parse::inferred); - column_names = reader_opts.get_names(); - } else { - column_names = get_column_names( - header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix()); - - num_actual_columns = num_active_columns = column_names.size(); - - column_flags.resize(num_actual_columns, column_parse::enabled | column_parse::inferred); - + auto const unique_use_cols_indexes = std::set(reader_opts.get_use_cols_indexes().cbegin(), + reader_opts.get_use_cols_indexes().cend()); + + auto const detected_column_names = + get_column_names(header, parse_opts.view(), reader_opts.get_header(), reader_opts.get_prefix()); + auto const opts_have_all_col_names = + not reader_opts.get_names().empty() and + ( + // no data to detect (the number of) columns + detected_column_names.empty() or + // number of user specified names matches what is detected + reader_opts.get_names().size() == detected_column_names.size() or + // Columns are not selected by indices; read first reader_opts.get_names().size() columns + unique_use_cols_indexes.empty()); + auto column_names = opts_have_all_col_names ? 
reader_opts.get_names() : detected_column_names; + + auto const num_actual_columns = static_cast(column_names.size()); + auto num_active_columns = num_actual_columns; + auto column_flags = std::vector( + num_actual_columns, column_parse::enabled | column_parse::inferred); + + // User did not pass column names to override names in the file + // Process names from the file to remove empty and duplicated strings + if (not opts_have_all_col_names) { std::vector col_loop_order(column_names.size()); auto unnamed_it = std::copy_if( thrust::make_counting_iterator(0), thrust::make_counting_iterator(column_names.size()), col_loop_order.begin(), [&column_names](auto col_idx) -> bool { return not column_names[col_idx].empty(); }); + // Rename empty column names to "Unnamed: col_index" std::copy_if(thrust::make_counting_iterator(0), thrust::make_counting_iterator(column_names.size()), @@ -759,24 +761,44 @@ table_with_metadata read_csv(cudf::io::datasource* source, } // User can specify which columns should be parsed - if (!reader_opts.get_use_cols_indexes().empty() || !reader_opts.get_use_cols_names().empty()) { + auto const unique_use_cols_names = std::unordered_set(reader_opts.get_use_cols_names().cbegin(), + reader_opts.get_use_cols_names().cend()); + auto const is_column_selection_used = + not unique_use_cols_names.empty() or not unique_use_cols_indexes.empty(); + + // Reset flags and output column count; columns will be reactivated based on the selection options + if (is_column_selection_used) { std::fill(column_flags.begin(), column_flags.end(), column_parse::disabled); + num_active_columns = 0; + } - for (const auto index : reader_opts.get_use_cols_indexes()) { + // Column selection via column indexes + if (not unique_use_cols_indexes.empty()) { + // Users can pass names for the selected columns only, if selecting column by their indices + auto const are_opts_col_names_used = + not reader_opts.get_names().empty() and not opts_have_all_col_names; + CUDF_EXPECTS(not are_opts_col_names_used or + reader_opts.get_names().size() == unique_use_cols_indexes.size(), + "Specify names of all columns in the file, or names of all selected columns"); + + for (auto const index : unique_use_cols_indexes) { column_flags[index] = column_parse::enabled | column_parse::inferred; + if (are_opts_col_names_used) { + column_names[index] = reader_opts.get_names()[num_active_columns]; + } + ++num_active_columns; } - num_active_columns = std::unordered_set(reader_opts.get_use_cols_indexes().begin(), - reader_opts.get_use_cols_indexes().end()) - .size(); + } - for (const auto& name : reader_opts.get_use_cols_names()) { - const auto it = std::find(column_names.begin(), column_names.end(), name); - if (it != column_names.end()) { - auto curr_it = it - column_names.begin(); - if (column_flags[curr_it] == column_parse::disabled) { - column_flags[curr_it] = column_parse::enabled | column_parse::inferred; - num_active_columns++; - } + // Column selection via column names + if (not unique_use_cols_names.empty()) { + for (auto const& name : unique_use_cols_names) { + auto const it = std::find(column_names.cbegin(), column_names.cend(), name); + CUDF_EXPECTS(it != column_names.end(), "Nonexistent column selected"); + auto const col_idx = std::distance(column_names.cbegin(), it); + if (column_flags[col_idx] == column_parse::disabled) { + column_flags[col_idx] = column_parse::enabled | column_parse::inferred; + ++num_active_columns; } } } @@ -813,6 +835,8 @@ table_with_metadata read_csv(cudf::io::datasource* source, // Return empty 
table rather than exception if nothing to load if (num_active_columns == 0) { return {std::make_unique
(), {}}; } + // Exclude the end-of-data row from number of rows with actual data + auto const num_records = std::max(row_offsets.size(), 1ul) - 1; auto const column_types = determine_column_types( reader_opts, parse_opts, column_names, data, row_offsets, num_records, column_flags, stream); diff --git a/cpp/src/io/csv/writer_impl.cu b/cpp/src/io/csv/writer_impl.cu index 2fae7b4c75a..ed2f412f291 100644 --- a/cpp/src/io/csv/writer_impl.cu +++ b/cpp/src/io/csv/writer_impl.cu @@ -279,21 +279,21 @@ struct column_to_strings_fn { // void write_chunked_begin(data_sink* out_sink, table_view const& table, - table_metadata const* metadata, + host_span user_column_names, csv_writer_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { if (options.is_enabled_include_header()) { - // need to generate column names if metadata is not provided + // need to generate column names if names are not provided std::vector generated_col_names; - if (metadata == nullptr) { + if (user_column_names.empty()) { generated_col_names.resize(table.num_columns()); thrust::tabulate(generated_col_names.begin(), generated_col_names.end(), [](auto idx) { return std::to_string(idx); }); } - auto const& column_names = (metadata == nullptr) ? generated_col_names : metadata->column_names; + auto const& column_names = user_column_names.empty() ? generated_col_names : user_column_names; CUDF_EXPECTS(column_names.size() == static_cast(table.num_columns()), "Mismatch between number of column headers and table columns."); @@ -346,7 +346,6 @@ void write_chunked_begin(data_sink* out_sink, void write_chunked(data_sink* out_sink, strings_column_view const& str_column_view, - table_metadata const* metadata, csv_writer_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -365,8 +364,11 @@ void write_chunked(data_sink* out_sink, CUDF_EXPECTS(str_column_view.size() > 0, "Unexpected empty strings column."); cudf::string_scalar newline{options.get_line_terminator()}; - auto p_str_col_w_nl = - cudf::strings::detail::join_strings(str_column_view, newline, string_scalar("", false), stream); + auto p_str_col_w_nl = cudf::strings::detail::join_strings(str_column_view, + newline, + string_scalar("", false), + stream, + rmm::mr::get_current_device_resource()); strings_column_view strings_column{p_str_col_w_nl->view()}; auto total_num_bytes = strings_column.chars_size(); @@ -399,7 +401,7 @@ void write_chunked(data_sink* out_sink, void write_csv(data_sink* out_sink, table_view const& table, - table_metadata const* metadata, + host_span user_column_names, csv_writer_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) @@ -407,7 +409,7 @@ void write_csv(data_sink* out_sink, // write header: column names separated by delimiter: // (even for tables with no rows) // - write_chunked_begin(out_sink, table, metadata, options, stream, mr); + write_chunked_begin(out_sink, table, user_column_names, options, stream, mr); if (table.num_rows() > 0) { // no need to check same-size columns constraint; auto-enforced by table_view @@ -471,12 +473,14 @@ void write_csv(data_sink* out_sink, delimiter_str, options.get_na_rep(), strings::separator_on_nulls::YES, - stream); + stream, + rmm::mr::get_current_device_resource()); cudf::string_scalar narep{options.get_na_rep()}; - return cudf::strings::detail::replace_nulls(str_table_view.column(0), narep, stream); + return cudf::strings::detail::replace_nulls( + str_table_view.column(0), narep, stream, 
rmm::mr::get_current_device_resource()); }(); - write_chunked(out_sink, str_concat_col->view(), metadata, options, stream, mr); + write_chunked(out_sink, str_concat_col->view(), options, stream, mr); } } } diff --git a/cpp/src/io/fst/logical_stack.cuh b/cpp/src/io/fst/logical_stack.cuh index 9502922a379..b23a3d756df 100644 --- a/cpp/src/io/fst/logical_stack.cuh +++ b/cpp/src/io/fst/logical_stack.cuh @@ -267,7 +267,7 @@ void sparse_stack_op_to_top_of_stack(StackSymbolItT d_symbols, StackSymbolT const empty_stack_symbol, StackSymbolT const read_symbol, std::size_t const num_symbols_out, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream = cudf::get_default_stream()) { rmm::device_buffer temp_storage{}; diff --git a/cpp/src/io/functions.cpp b/cpp/src/io/functions.cpp index aabaa941daf..1a5a43d2b90 100644 --- a/cpp/src/io/functions.cpp +++ b/cpp/src/io/functions.cpp @@ -34,6 +34,8 @@ #include #include +#include + namespace cudf { namespace io { // Returns builder for csv_reader_options @@ -156,7 +158,7 @@ table_with_metadata read_avro(avro_reader_options const& options, CUDF_EXPECTS(datasources.size() == 1, "Only a single source is currently supported."); - return avro::read_avro(std::move(datasources[0]), options, cudf::default_stream_value, mr); + return avro::read_avro(std::move(datasources[0]), options, cudf::get_default_stream(), mr); } compression_type infer_compression_type(compression_type compression, source_info const& info) @@ -198,7 +200,7 @@ table_with_metadata read_json(json_reader_options options, rmm::mr::device_memor options.get_byte_range_offset(), options.get_byte_range_size_with_padding()); - return detail::json::read_json(datasources, options, cudf::default_stream_value, mr); + return detail::json::read_json(datasources, options, cudf::get_default_stream(), mr); } table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_resource* mr) @@ -216,7 +218,7 @@ table_with_metadata read_csv(csv_reader_options options, rmm::mr::device_memory_ return cudf::io::detail::csv::read_csv( // std::move(datasources[0]), options, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -231,9 +233,9 @@ void write_csv(csv_writer_options const& options, rmm::mr::device_memory_resourc return csv::write_csv( // sinks[0].get(), options.get_table(), - options.get_metadata(), + options.get_names(), options, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -241,7 +243,7 @@ namespace detail_orc = cudf::io::detail::orc; raw_orc_statistics read_raw_orc_statistics(source_info const& src_info) { - auto stream = cudf::default_stream_value; + auto stream = cudf::get_default_stream(); // Get source to read statistics from std::unique_ptr source; if (src_info.type() == io_type::FILEPATH) { @@ -337,6 +339,40 @@ parsed_orc_statistics read_parsed_orc_statistics(source_info const& src_info) return result; } +namespace { +orc_column_schema make_orc_column_schema(host_span orc_schema, + uint32_t column_id, + std::string column_name) +{ + auto const& orc_col_schema = orc_schema[column_id]; + std::vector children; + children.reserve(orc_col_schema.subtypes.size()); + std::transform( + orc_col_schema.subtypes.cbegin(), + orc_col_schema.subtypes.cend(), + cudf::detail::make_counting_transform_iterator(0, + [&names = orc_col_schema.fieldNames](size_t i) { + return i < names.size() ? 
names[i] + : std::string{}; + }), + std::back_inserter(children), + [&](auto& type, auto name) { return make_orc_column_schema(orc_schema, type, name); }); + + return {std::move(column_name), orc_schema[column_id].kind, std::move(children)}; +} +}; // namespace + +orc_metadata read_orc_metadata(source_info const& src_info) +{ + auto sources = make_datasources(src_info); + + CUDF_EXPECTS(sources.size() == 1, "Only a single source is currently supported."); + auto const footer = orc::metadata(sources.front().get(), cudf::detail::default_stream_value).ff; + + return {{make_orc_column_schema(footer.types, 0, "")}, + static_cast(footer.numberOfRows), + static_cast(footer.stripes.size())}; +} /** * @copydoc cudf::io::read_orc @@ -347,9 +383,9 @@ table_with_metadata read_orc(orc_reader_options const& options, rmm::mr::device_ auto datasources = make_datasources(options.get_source()); auto reader = std::make_unique( - std::move(datasources), options, cudf::default_stream_value, mr); + std::move(datasources), options, cudf::get_default_stream(), mr); - return reader->read(options); + return reader->read(options, cudf::get_default_stream()); } /** @@ -365,7 +401,7 @@ void write_orc(orc_writer_options const& options, rmm::mr::device_memory_resourc CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for ORC writing"); auto writer = std::make_unique( - std::move(sinks[0]), options, io_detail::SingleWriteMode::YES, cudf::default_stream_value, mr); + std::move(sinks[0]), options, io_detail::SingleWriteMode::YES, cudf::get_default_stream(), mr); writer->write(options.get_table()); } @@ -382,7 +418,7 @@ orc_chunked_writer::orc_chunked_writer(chunked_orc_writer_options const& options CUDF_EXPECTS(sinks.size() == 1, "Multiple sinks not supported for ORC writing"); writer = std::make_unique( - std::move(sinks[0]), options, io_detail::SingleWriteMode::NO, cudf::default_stream_value, mr); + std::move(sinks[0]), options, io_detail::SingleWriteMode::NO, cudf::get_default_stream(), mr); } /** @@ -417,7 +453,7 @@ table_with_metadata read_parquet(parquet_reader_options const& options, auto datasources = make_datasources(options.get_source()); auto reader = std::make_unique( - std::move(datasources), options, cudf::default_stream_value, mr); + std::move(datasources), options, cudf::get_default_stream(), mr); return reader->read(options); } @@ -458,13 +494,52 @@ std::unique_ptr> write_parquet(parquet_writer_options const auto sinks = make_datasinks(options.get_sink()); auto writer = std::make_unique( - std::move(sinks), options, io_detail::SingleWriteMode::YES, cudf::default_stream_value, mr); + std::move(sinks), options, io_detail::SingleWriteMode::YES, cudf::get_default_stream(), mr); writer->write(options.get_table(), options.get_partitions()); return writer->close(options.get_column_chunks_file_paths()); } +/** + * @copydoc cudf::io::chunked_parquet_reader::chunked_parquet_reader + */ +chunked_parquet_reader::chunked_parquet_reader(std::size_t chunk_read_limit, + parquet_reader_options const& options, + rmm::mr::device_memory_resource* mr) + : reader{std::make_unique(chunk_read_limit, + make_datasources(options.get_source()), + options, + cudf::get_default_stream(), + mr)} +{ +} + +/** + * @copydoc cudf::io::chunked_parquet_reader::~chunked_parquet_reader + */ +chunked_parquet_reader::~chunked_parquet_reader() = default; + +/** + * @copydoc cudf::io::chunked_parquet_reader::has_next + */ +bool chunked_parquet_reader::has_next() const +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(reader != nullptr, "Reader has 
not been constructed properly."); + return reader->has_next(); +} + +/** + * @copydoc cudf::io::chunked_parquet_reader::read_chunk + */ +table_with_metadata chunked_parquet_reader::read_chunk() const +{ + CUDF_FUNC_RANGE(); + CUDF_EXPECTS(reader != nullptr, "Reader has not been constructed properly."); + return reader->read_chunk(); +} + /** * @copydoc cudf::io::parquet_chunked_writer::parquet_chunked_writer */ @@ -476,7 +551,7 @@ parquet_chunked_writer::parquet_chunked_writer(chunked_parquet_writer_options co auto sinks = make_datasinks(options.get_sink()); writer = std::make_unique( - std::move(sinks), options, io_detail::SingleWriteMode::NO, cudf::default_stream_value, mr); + std::move(sinks), options, io_detail::SingleWriteMode::NO, cudf::get_default_stream(), mr); } /** diff --git a/cpp/src/io/json/experimental/byte_range_info.cu b/cpp/src/io/json/experimental/byte_range_info.cu new file mode 100644 index 00000000000..d6e30d090a5 --- /dev/null +++ b/cpp/src/io/json/experimental/byte_range_info.cu @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include +#include + +#include +#include +#include + +namespace cudf::io::detail::json::experimental { + +// Extract the first character position in the string. +size_type find_first_delimiter(device_span d_data, + char const delimiter, + rmm::cuda_stream_view stream) +{ + auto const first_delimiter_position = + thrust::find(rmm::exec_policy(stream), d_data.begin(), d_data.end(), delimiter); + return first_delimiter_position != d_data.end() ? first_delimiter_position - d_data.begin() : -1; +} + +} // namespace cudf::io::detail::json::experimental diff --git a/cpp/src/io/json/experimental/read_json.cpp b/cpp/src/io/json/experimental/read_json.cpp index c0eaa43e68f..87d196131ca 100644 --- a/cpp/src/io/json/experimental/read_json.cpp +++ b/cpp/src/io/json/experimental/read_json.cpp @@ -19,27 +19,135 @@ #include #include +#include #include #include namespace cudf::io::detail::json::experimental { -std::vector ingest_raw_input(host_span> sources, - compression_type compression) +size_t sources_size(host_span> const sources, + size_t range_offset, + size_t range_size) { - auto const total_source_size = - std::accumulate(sources.begin(), sources.end(), 0ul, [](size_t sum, auto& source) { - return sum + source->size(); - }); - auto buffer = std::vector(total_source_size); + return std::accumulate(sources.begin(), sources.end(), 0ul, [=](size_t sum, auto& source) { + auto const size = source->size(); + // TODO take care of 0, 0, or *, 0 case. + return sum + + (range_size == 0 or range_offset + range_size > size ? 
size - range_offset : range_size); + }); +} + +std::vector ingest_raw_input(host_span> const& sources, + compression_type compression, + size_t range_offset, + size_t range_size) +{ + CUDF_FUNC_RANGE(); + // Iterate through the user defined sources and read the contents into the local buffer + auto const total_source_size = sources_size(sources, range_offset, range_size); + auto buffer = std::vector(total_source_size); size_t bytes_read = 0; for (const auto& source : sources) { - bytes_read += source->host_read(0, source->size(), buffer.data() + bytes_read); + if (!source->is_empty()) { + auto data_size = (range_size != 0) ? range_size : source->size(); + auto destination = buffer.data() + bytes_read; + bytes_read += source->host_read(range_offset, data_size, destination); + } } - return (compression == compression_type::NONE) ? buffer : decompress(compression, buffer); + if (compression == compression_type::NONE) { + return buffer; + } else { + return decompress(compression, buffer); + } +} + +size_type find_first_delimiter_in_chunk(host_span> sources, + json_reader_options const& reader_opts, + char const delimiter, + rmm::cuda_stream_view stream) +{ + auto const buffer = ingest_raw_input(sources, + reader_opts.get_compression(), + reader_opts.get_byte_range_offset(), + reader_opts.get_byte_range_size()); + auto d_data = rmm::device_uvector(buffer.size(), stream); + CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data(), + buffer.data(), + buffer.size() * sizeof(decltype(buffer)::value_type), + cudaMemcpyHostToDevice, + stream.value())); + return find_first_delimiter(d_data, delimiter, stream); +} + +size_type find_first_delimiter_in_chunk(host_span buffer, + char const delimiter, + rmm::cuda_stream_view stream) +{ + auto d_data = rmm::device_uvector(buffer.size(), stream); + CUDF_CUDA_TRY(cudaMemcpyAsync(d_data.data(), + buffer.data(), + buffer.size() * sizeof(decltype(buffer)::value_type), + cudaMemcpyHostToDevice, + stream.value())); + return find_first_delimiter(d_data, delimiter, stream); +} + +bool should_load_whole_source(json_reader_options const& reader_opts) +{ + return reader_opts.get_byte_range_offset() == 0 and // + reader_opts.get_byte_range_size() == 0; +} + +/** + * @brief Get the byte range between record starts and ends starting from the given range. + * + * if get_byte_range_offset == 0, then we can skip the first delimiter search + * if get_byte_range_offset != 0, then we need to search for the first delimiter in given range. + * if not found, skip this chunk, if found, then search for first delimiter in next range until we + * find a delimiter. Use this as actual range for parsing. + * + * @param sources Data sources to read from + * @param reader_opts JSON reader options with range offset and range size + * @param stream CUDA stream used for device memory operations and kernel launches + * @return Byte range for parsing + */ +auto get_record_range_raw_input(host_span> sources, + json_reader_options const& reader_opts, + rmm::cuda_stream_view stream) +{ + auto buffer = ingest_raw_input(sources, + reader_opts.get_compression(), + reader_opts.get_byte_range_offset(), + reader_opts.get_byte_range_size()); + if (should_load_whole_source(reader_opts)) return buffer; + auto first_delim_pos = reader_opts.get_byte_range_offset() == 0 + ? 
0 + : find_first_delimiter_in_chunk(buffer, '\n', stream); + if (first_delim_pos == -1) { + return std::vector{}; + } else { + first_delim_pos = first_delim_pos + reader_opts.get_byte_range_offset(); + // Find next delimiter + decltype(first_delim_pos) next_delim_pos = -1; + auto const total_source_size = sources_size(sources, 0, 0); + auto current_offset = reader_opts.get_byte_range_offset() + reader_opts.get_byte_range_size(); + while (current_offset < total_source_size and next_delim_pos == -1) { + buffer = ingest_raw_input( + sources, reader_opts.get_compression(), current_offset, reader_opts.get_byte_range_size()); + next_delim_pos = find_first_delimiter_in_chunk(buffer, '\n', stream); + if (next_delim_pos == -1) { current_offset += reader_opts.get_byte_range_size(); } + } + if (next_delim_pos == -1) { + next_delim_pos = total_source_size; + } else { + next_delim_pos = next_delim_pos + current_offset; + } + return ingest_raw_input( + sources, reader_opts.get_compression(), first_delim_pos, next_delim_pos - first_delim_pos); + } } table_with_metadata read_json(host_span> sources, @@ -47,10 +155,14 @@ table_with_metadata read_json(host_span> sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { - CUDF_EXPECTS(reader_opts.get_byte_range_offset() == 0 and reader_opts.get_byte_range_size() == 0, - "specifying a byte range is not yet supported"); + CUDF_FUNC_RANGE(); + if (not should_load_whole_source(reader_opts)) { + CUDF_EXPECTS(reader_opts.is_enabled_lines(), + "specifying a byte range is supported only for json lines"); + } + + auto const buffer = get_record_range_raw_input(sources, reader_opts, stream); - auto const buffer = ingest_raw_input(sources, reader_opts.get_compression()); auto data = host_span(reinterpret_cast(buffer.data()), buffer.size()); try { diff --git a/cpp/src/io/json/experimental/read_json.hpp b/cpp/src/io/json/experimental/read_json.hpp index c9f74b2cc41..48e104c4254 100644 --- a/cpp/src/io/json/experimental/read_json.hpp +++ b/cpp/src/io/json/experimental/read_json.hpp @@ -33,4 +33,13 @@ table_with_metadata read_json(host_span> sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr); -} +size_type find_first_delimiter(device_span d_data, + char const delimiter, + rmm::cuda_stream_view stream); + +size_type find_first_delimiter_in_chunk(host_span> sources, + json_reader_options const& reader_opts, + char const delimiter, + rmm::cuda_stream_view stream); + +} // namespace cudf::io::detail::json::experimental diff --git a/cpp/src/io/json/json_column.cu b/cpp/src/io/json/json_column.cu index d54bb5c8ea9..0ac3efb407e 100644 --- a/cpp/src/io/json/json_column.cu +++ b/cpp/src/io/json/json_column.cu @@ -73,7 +73,7 @@ auto print_vec = [](auto const& cpu, auto const name, auto converter) { void print_tree(host_span input, tree_meta_t const& d_gpu_tree, - rmm::cuda_stream_view stream = cudf::default_stream_value) + rmm::cuda_stream_view stream) { print_vec(cudf::detail::make_std_vector_async(d_gpu_tree.node_categories, stream), "node_categories", @@ -278,11 +278,11 @@ std::vector copy_strings_to_host(device_span input, auto const scv = cudf::strings_column_view(col); auto const h_chars = cudf::detail::make_std_vector_sync( cudf::device_span(scv.chars().data(), scv.chars().size()), - cudf::default_stream_value); + cudf::get_default_stream()); auto const h_offsets = cudf::detail::make_std_vector_sync( cudf::device_span( scv.offsets().data() + scv.offset(), scv.size() + 1), - cudf::default_stream_value); + cudf::get_default_stream()); 
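The chunking scheme documented on get_record_range_raw_input above boils down to: snap the range start forward to the first record delimiter at or after the requested offset, then extend the range end to the next delimiter (or end of input) so that no JSON-lines record is split across chunks. A host-only sketch of that snapping, with std::string standing in for the ingested buffer; the real code instead re-ingests subsequent byte ranges and searches for the delimiter on the device:

```cpp
#include <iostream>
#include <string>
#include <utility>

// Returns the [begin, end) byte range covering whole '\n'-delimited records
// for a requested (offset, size) window; a toy analogue of the logic above.
std::pair<size_t, size_t> record_range(std::string const& data, size_t offset, size_t size)
{
  // Snap the start to the first delimiter at/after `offset` (offset 0 needs no search).
  size_t begin = offset == 0 ? 0 : data.find('\n', offset);
  if (begin == std::string::npos) return {data.size(), data.size()};  // empty chunk
  if (offset != 0) ++begin;  // start just past the delimiter
  // Extend the end to the next delimiter at/after the window end, or EOF.
  size_t end = data.find('\n', offset + size);
  end = (end == std::string::npos) ? data.size() : end + 1;
  return {begin, end};
}

int main()
{
  std::string const jsonl = "{\"a\":1}\n{\"a\":2}\n{\"a\":3}\n";
  auto const [b, e] = record_range(jsonl, 4, 6);  // window lands mid-record
  std::cout << jsonl.substr(b, e - b);            // prints the second record only
}
```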
// build std::string vector from chars and offsets std::vector host_data; @@ -403,7 +403,7 @@ void make_device_json_column(device_span input, std::string name = ""; auto parent_col_id = column_parent_ids[this_col_id]; if (parent_col_id == parent_node_sentinel || column_categories[parent_col_id] == NC_LIST) { - name = "element"; + name = list_child_name; } else if (column_categories[parent_col_id] == NC_FN) { auto field_name_col_id = parent_col_id; parent_col_id = column_parent_ids[parent_col_id]; @@ -525,14 +525,15 @@ void make_device_json_column(device_span input, auto parent_node_id = ordered_parent_node_ids[i]; if (parent_node_id != parent_node_sentinel and node_categories[parent_node_id] == NC_LIST) { // unique item - if (i == 0 || + if (i == 0 or (col_ids[i - 1] != col_ids[i] or ordered_parent_node_ids[i - 1] != parent_node_id)) { // scatter to list_offset d_columns_data[original_col_ids[parent_node_id]] .child_offsets[row_offsets[parent_node_id]] = ordered_row_offsets[i]; } // TODO: verify if this code is right. check with more test cases. - if (i == num_nodes - 1 || (col_ids[i] != col_ids[i + 1])) { + if (i == num_nodes - 1 or + (col_ids[i] != col_ids[i + 1] or ordered_parent_node_ids[i + 1] != parent_node_id)) { // last value of list child_offset is its size. d_columns_data[original_col_ids[parent_node_id]] .child_offsets[row_offsets[parent_node_id] + 1] = ordered_row_offsets[i] + 1; @@ -689,19 +690,24 @@ std::pair, std::vector> device_json_co size_type num_rows = json_col.child_offsets.size() - 1; std::vector column_names{}; column_names.emplace_back("offsets"); - column_names.emplace_back(json_col.child_columns.begin()->first); + column_names.emplace_back( + json_col.child_columns.empty() ? list_child_name : json_col.child_columns.begin()->first); // Note: json_col modified here, reuse the memory auto offsets_column = std::make_unique( data_type{type_id::INT32}, num_rows + 1, json_col.child_offsets.release()); // Create children column auto [child_column, names] = - device_json_column_to_cudf_column(json_col.child_columns.begin()->second, - d_input, - options, - get_child_schema(json_col.child_columns.begin()->first), - stream, - mr); + json_col.child_columns.empty() + ? std::pair, + std::vector>{std::make_unique(), {}} + : device_json_column_to_cudf_column( + json_col.child_columns.begin()->second, + d_input, + options, + get_child_schema(json_col.child_columns.begin()->first), + stream, + mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); return {make_lists_column(num_rows, @@ -717,16 +723,13 @@ std::pair, std::vector> device_json_co } } -table_with_metadata device_parse_nested_json(host_span input, +table_with_metadata device_parse_nested_json(device_span d_input, cudf::io::json_reader_options const& options, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - // Allocate device memory for the JSON input & copy over to device - rmm::device_uvector d_input = cudf::detail::make_device_uvector_async(input, stream); - auto gpu_tree = [&]() { // Parse the JSON and get the token stream const auto [tokens_gpu, token_indices_gpu] = get_token_stream(d_input, options, stream); @@ -734,7 +737,8 @@ table_with_metadata device_parse_nested_json(host_span input, return get_tree_representation(tokens_gpu, token_indices_gpu, stream); }(); // IILE used to free memory of token data. 
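// A small host-side illustration (made-up data, not the library code) of the
// run-boundary tests used in make_device_json_column above: with nodes ordered
// by (col_id, parent_node_id), the first node of each run writes the parent
// list row's begin offset and the last node writes its end offset. The extra
// ordered_parent_node_ids comparison added above is what separates two list
// rows whose children sit in the same child column.
#include <cstdio>
#include <vector>

int main()
{
  std::vector<int> col_ids = {4, 4, 4, 4, 4};  // all children belong to one child column
  std::vector<int> parents = {0, 0, 0, 1, 1};  // list row 0 has 3 items, row 1 has 2
  std::vector<int> row_off = {0, 1, 2, 3, 4};  // ordered row offsets of the children
  std::size_t const n     = col_ids.size();
  for (std::size_t i = 0; i < n; ++i) {
    bool const is_first =
      i == 0 or (col_ids[i - 1] != col_ids[i] or parents[i - 1] != parents[i]);
    bool const is_last =
      i == n - 1 or (col_ids[i] != col_ids[i + 1] or parents[i + 1] != parents[i]);
    if (is_first) { std::printf("list row %d: begin offset %d\n", parents[i], row_off[i]); }
    if (is_last) { std::printf("list row %d: end offset %d\n", parents[i], row_off[i] + 1); }
  }
  // prints: row 0 -> [0, 3), row 1 -> [3, 5)
}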
#ifdef NJP_DEBUG_PRINT - print_tree(input, gpu_tree, stream); + auto h_input = cudf::detail::make_host_vector_async(d_input, stream); + print_tree(h_input, gpu_tree, stream); #endif auto [gpu_col_id, gpu_row_offsets] = records_orient_tree_traversal(d_input, gpu_tree, stream); @@ -836,5 +840,17 @@ table_with_metadata device_parse_nested_json(host_span input, {{}, out_column_names}}; } +table_with_metadata device_parse_nested_json(host_span input, + cudf::io::json_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + CUDF_FUNC_RANGE(); + + // Allocate device memory for the JSON input & copy over to device + rmm::device_uvector d_input = cudf::detail::make_device_uvector_async(input, stream); + + return device_parse_nested_json(device_span{d_input}, options, stream, mr); +} } // namespace detail } // namespace cudf::io::json diff --git a/cpp/src/io/json/json_gpu.cu b/cpp/src/io/json/json_gpu.cu index dbfcca7d37a..8b6c0f9d528 100644 --- a/cpp/src/io/json/json_gpu.cu +++ b/cpp/src/io/json/json_gpu.cu @@ -156,6 +156,7 @@ struct field_descriptor { cudf::size_type column; char const* value_begin; char const* value_end; + bool is_quoted; }; /** @@ -178,7 +179,10 @@ __device__ field_descriptor next_field_descriptor(const char* begin, auto const desc_pre_trim = col_map.capacity() == 0 // No key - column and begin are trivial - ? field_descriptor{field_idx, begin, cudf::io::gpu::seek_field_end(begin, end, opts, true)} + ? field_descriptor{field_idx, + begin, + cudf::io::gpu::seek_field_end(begin, end, opts, true), + false} : [&]() { auto const key_range = get_next_key(begin, end, opts.quotechar); auto const key_hash = cudf::detail::MurmurHash3_32{}( @@ -189,14 +193,23 @@ __device__ field_descriptor next_field_descriptor(const char* begin, // Skip the colon between the key and the value auto const value_begin = thrust::find(thrust::seq, key_range.second, end, ':') + 1; - return field_descriptor{ - column, value_begin, cudf::io::gpu::seek_field_end(value_begin, end, opts, true)}; + return field_descriptor{column, + value_begin, + cudf::io::gpu::seek_field_end(value_begin, end, opts, true), + false}; }(); // Modify start & end to ignore whitespace and quotechars auto const trimmed_value_range = - trim_whitespaces_quotes(desc_pre_trim.value_begin, desc_pre_trim.value_end, opts.quotechar); - return {desc_pre_trim.column, trimmed_value_range.first, trimmed_value_range.second}; + trim_whitespaces(desc_pre_trim.value_begin, desc_pre_trim.value_end); + bool const is_quoted = + thrust::distance(trimmed_value_range.first, trimmed_value_range.second) >= 2 and + *trimmed_value_range.first == opts.quotechar and + *thrust::prev(trimmed_value_range.second) == opts.quotechar; + return {desc_pre_trim.column, + trimmed_value_range.first + static_cast(is_quoted), + trimmed_value_range.second - static_cast(is_quoted), + is_quoted}; } /** @@ -255,13 +268,14 @@ __global__ void convert_data_to_columns_kernel(parse_options_view opts, auto const desc = next_field_descriptor(current, row_data_range.second, opts, input_field_index, col_map); auto const value_len = static_cast(std::max(desc.value_end - desc.value_begin, 0L)); + auto const is_quoted = static_cast(desc.is_quoted); current = desc.value_end + 1; using string_index_pair = thrust::pair; - // Empty fields are not legal values - if (!serialized_trie_contains(opts.trie_na, {desc.value_begin, value_len})) { + if (!serialized_trie_contains(opts.trie_na, + {desc.value_begin - is_quoted, value_len + is_quoted * 2})) { // 
Type dispatcher does not handle strings if (column_types[desc.column].id() == type_id::STRING) { auto str_list = static_cast(output_columns[desc.column]); @@ -345,7 +359,7 @@ __global__ void detect_data_types_kernel( atomicAdd(&column_infos[desc.column].null_count, -1); } // Don't need counts to detect strings, any field in quotes is deduced to be a string - if (*(desc.value_begin - 1) == opts.quotechar && *desc.value_end == opts.quotechar) { + if (desc.is_quoted) { atomicAdd(&column_infos[desc.column].string_count, 1); continue; } diff --git a/cpp/src/io/json/json_tree.cu b/cpp/src/io/json/json_tree.cu index dbf026c351e..50755724c51 100644 --- a/cpp/src/io/json/json_tree.cu +++ b/cpp/src/io/json/json_tree.cu @@ -29,19 +29,25 @@ #include +#include + #include #include #include #include +#include #include #include #include #include #include +#include +#include #include #include #include +#include #include #include #include @@ -79,7 +85,7 @@ struct node_ranges { __device__ auto operator()(size_type i) -> thrust::tuple { // Whether a token expects to be followed by its respective end-of-* token partner - auto is_begin_of_section = [] __device__(PdaTokenT const token) { + auto const is_begin_of_section = [] __device__(PdaTokenT const token) { switch (token) { case token_t::StringBegin: case token_t::ValueBegin: @@ -88,7 +94,7 @@ struct node_ranges { }; }; // The end-of-* partner token for a given beginning-of-* token - auto end_of_partner = [] __device__(PdaTokenT const token) { + auto const end_of_partner = [] __device__(PdaTokenT const token) { switch (token) { case token_t::StringBegin: return token_t::StringEnd; case token_t::ValueBegin: return token_t::ValueEnd; @@ -98,8 +104,8 @@ struct node_ranges { }; // Includes quote char for end-of-string token or skips the quote char for // beginning-of-field-name token - auto get_token_index = [include_quote_char = include_quote_char] __device__( - PdaTokenT const token, SymbolOffsetT const token_index) { + auto const get_token_index = [include_quote_char = include_quote_char] __device__( + PdaTokenT const token, SymbolOffsetT const token_index) { constexpr SymbolOffsetT quote_char_size = 1; switch (token) { // Strip off quote char included for StringBegin @@ -125,6 +131,81 @@ struct node_ranges { } }; +/** + * @brief Returns stable sorted keys and their sorted order + * + * Uses cub stable radix sort. The order is generated internally, which saves a copy and memory. + * Since both the keys and the order are returned, using a double buffer avoids an extra copy to a + * user-provided output iterator. + * + * @tparam IndexType sorted order type + * @tparam KeyType key type + * @param keys keys to sort + * @param stream CUDA stream used for device memory operations and kernel launches.
+ * @return Sorted keys and indices producing that sorted order + */ +template +std::pair, rmm::device_uvector> stable_sorted_key_order( + cudf::device_span keys, rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + + // Determine temporary device storage requirements + rmm::device_uvector keys_buffer1(keys.size(), stream); + rmm::device_uvector keys_buffer2(keys.size(), stream); + rmm::device_uvector order_buffer1(keys.size(), stream); + rmm::device_uvector order_buffer2(keys.size(), stream); + cub::DoubleBuffer order_buffer(order_buffer1.data(), order_buffer2.data()); + cub::DoubleBuffer keys_buffer(keys_buffer1.data(), keys_buffer2.data()); + size_t temp_storage_bytes = 0; + cub::DeviceRadixSort::SortPairs( + nullptr, temp_storage_bytes, keys_buffer, order_buffer, keys.size()); + rmm::device_buffer d_temp_storage(temp_storage_bytes, stream); + + thrust::copy(rmm::exec_policy(stream), keys.begin(), keys.end(), keys_buffer1.begin()); + thrust::sequence(rmm::exec_policy(stream), order_buffer1.begin(), order_buffer1.end()); + + cub::DeviceRadixSort::SortPairs(d_temp_storage.data(), + temp_storage_bytes, + keys_buffer, + order_buffer, + keys.size(), + 0, + sizeof(KeyType) * 8, + stream.value()); + + return std::pair{keys_buffer.Current() == keys_buffer1.data() ? std::move(keys_buffer1) + : std::move(keys_buffer2), + order_buffer.Current() == order_buffer1.data() ? std::move(order_buffer1) + : std::move(order_buffer2)}; +} + +/** + * @brief Propagate parent node to siblings from first sibling. + * + * @param node_levels Node levels of each node + * @param parent_node_ids parent node ids initialized for first child of each push node, + * and other siblings are initialized to -1. + * @param stream CUDA stream used for device memory operations and kernel launches. + */ +void propagate_parent_to_siblings(cudf::device_span node_levels, + cudf::device_span parent_node_ids, + rmm::cuda_stream_view stream) +{ + CUDF_FUNC_RANGE(); + auto [sorted_node_levels, sorted_order] = stable_sorted_key_order(node_levels, stream); + // instead of gather, using permutation_iterator, which is ~17% faster + + thrust::inclusive_scan_by_key( + rmm::exec_policy(stream), + sorted_node_levels.begin(), + sorted_node_levels.end(), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::make_permutation_iterator(parent_node_ids.begin(), sorted_order.begin()), + thrust::equal_to{}, + thrust::maximum{}); +} + // Generates a tree representation of the given tokens, token_indices. 
tree_meta_t get_tree_representation(device_span tokens, device_span token_indices, @@ -133,7 +214,7 @@ tree_meta_t get_tree_representation(device_span tokens, { CUDF_FUNC_RANGE(); // Whether a token does represent a node in the tree representation - auto is_node = [] __device__(PdaTokenT const token) -> bool { + auto const is_node = [] __device__(PdaTokenT const token) -> bool { switch (token) { case token_t::StructBegin: case token_t::ListBegin: @@ -146,7 +227,7 @@ tree_meta_t get_tree_representation(device_span tokens, }; // Whether the token pops from the parent node stack - auto does_pop = [] __device__(PdaTokenT const token) -> bool { + auto const does_pop = [] __device__(PdaTokenT const token) -> bool { switch (token) { case token_t::StructMemberEnd: case token_t::StructEnd: @@ -156,7 +237,7 @@ tree_meta_t get_tree_representation(device_span tokens, }; // Whether the token pushes onto the parent node stack - auto does_push = [] __device__(PdaTokenT const token) -> bool { + auto const does_push = [] __device__(PdaTokenT const token) -> bool { switch (token) { case token_t::FieldNameBegin: case token_t::StructBegin: @@ -165,55 +246,126 @@ tree_meta_t get_tree_representation(device_span tokens, }; }; - auto num_tokens = tokens.size(); - auto is_node_it = thrust::make_transform_iterator( - tokens.begin(), - [is_node] __device__(auto t) -> size_type { return static_cast(is_node(t)); }); - auto num_nodes = thrust::count_if( - rmm::exec_policy(stream), tokens.begin(), tokens.begin() + num_tokens, is_node); + // Look for ErrorBegin and report the point of error. + if (auto const error_count = + thrust::count(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); + error_count > 0) { + auto const error_location = + thrust::find(rmm::exec_policy(stream), tokens.begin(), tokens.end(), token_t::ErrorBegin); + SymbolOffsetT error_index; + CUDF_CUDA_TRY( + cudaMemcpyAsync(&error_index, + token_indices.data() + thrust::distance(tokens.begin(), error_location), + sizeof(SymbolOffsetT), + cudaMemcpyDeviceToHost, + stream.value())); + stream.synchronize(); + CUDF_FAIL("JSON Parser encountered an invalid format at location " + + std::to_string(error_index)); + } + + auto const num_tokens = tokens.size(); + auto const num_nodes = + thrust::count_if(rmm::exec_policy(stream), tokens.begin(), tokens.end(), is_node); + + // Node levels: transform_exclusive_scan, copy_if. + rmm::device_uvector node_levels(num_nodes, stream, mr); + { + rmm::device_uvector token_levels(num_tokens, stream); + auto const push_pop_it = thrust::make_transform_iterator( + tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { + return does_push(token) - does_pop(token); + }); + thrust::exclusive_scan( + rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); + + auto const node_levels_end = thrust::copy_if(rmm::exec_policy(stream), + token_levels.begin(), + token_levels.end(), + tokens.begin(), + node_levels.begin(), + is_node); + CUDF_EXPECTS(thrust::distance(node_levels.begin(), node_levels_end) == num_nodes, + "node level count mismatch"); + } + + // Node parent ids: + // previous push node_id transform, stable sort by level, segmented scan with Max, reorder. + rmm::device_uvector parent_node_ids(num_nodes, stream, mr); + // This block of code is a generalized logical stack algorithm. TODO: make this a separate function.
+ { + rmm::device_uvector node_token_ids(num_nodes, stream); + thrust::copy_if(rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(0) + num_tokens, + tokens.begin(), + node_token_ids.begin(), + is_node); + + // previous push node_id + // if previous node is a push, then i-1 + // if previous node is FE, then i-2 (returns FB's index) + // if previous node is SMB and its previous node is a push, then i-2 + // eg. `{ SMB FB FE VB VE SME` -> `{` index as FB's parent. + // else -1 + auto const first_childs_parent_token_id = [tokens_gpu = + tokens.begin()] __device__(auto i) -> NodeIndexT { + if (i <= 0) { return -1; } + if (tokens_gpu[i - 1] == token_t::StructBegin or tokens_gpu[i - 1] == token_t::ListBegin) { + return i - 1; + } else if (tokens_gpu[i - 1] == token_t::FieldNameEnd) { + return i - 2; + } else if (tokens_gpu[i - 1] == token_t::StructMemberBegin and + (tokens_gpu[i - 2] == token_t::StructBegin || + tokens_gpu[i - 2] == token_t::ListBegin)) { + return i - 2; + } else { + return -1; + } + }; + + thrust::transform( + rmm::exec_policy(stream), + node_token_ids.begin(), + node_token_ids.end(), + parent_node_ids.begin(), + [node_ids_gpu = node_token_ids.begin(), num_nodes, first_childs_parent_token_id] __device__( + NodeIndexT const tid) -> NodeIndexT { + auto const pid = first_childs_parent_token_id(tid); + return pid < 0 + ? parent_node_sentinel + : thrust::lower_bound(thrust::seq, node_ids_gpu, node_ids_gpu + num_nodes, pid) - + node_ids_gpu; + // parent_node_sentinel is -1, useful for segmented max operation below + }); + } + // Propagate parent node to siblings from first sibling - inplace. + propagate_parent_to_siblings( + cudf::device_span{node_levels.data(), node_levels.size()}, + parent_node_ids, + stream); // Node categories: copy_if with transform. rmm::device_uvector node_categories(num_nodes, stream, mr); - auto node_categories_it = + auto const node_categories_it = thrust::make_transform_output_iterator(node_categories.begin(), token_to_node{}); - auto node_categories_end = thrust::copy_if(rmm::exec_policy(stream), - tokens.begin(), - tokens.begin() + num_tokens, - node_categories_it, - is_node); + auto const node_categories_end = thrust::copy_if( + rmm::exec_policy(stream), tokens.begin(), tokens.end(), node_categories_it, is_node); CUDF_EXPECTS(node_categories_end - node_categories_it == num_nodes, "node category count mismatch"); - // Node levels: transform_exclusive_scan, copy_if. - rmm::device_uvector token_levels(num_tokens, stream); - auto push_pop_it = thrust::make_transform_iterator( - tokens.begin(), [does_push, does_pop] __device__(PdaTokenT const token) -> size_type { - return does_push(token) - does_pop(token); - }); - thrust::exclusive_scan( - rmm::exec_policy(stream), push_pop_it, push_pop_it + num_tokens, token_levels.begin()); - - rmm::device_uvector node_levels(num_nodes, stream, mr); - auto node_levels_end = thrust::copy_if(rmm::exec_policy(stream), - token_levels.begin(), - token_levels.begin() + num_tokens, - tokens.begin(), - node_levels.begin(), - is_node); - CUDF_EXPECTS(node_levels_end - node_levels.begin() == num_nodes, "node level count mismatch"); - // Node ranges: copy_if with transform. 
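// A host-side walk-through (illustrative; host STL in place of thrust) of the
// two steps above. Step 1, the "previous push token" rule in
// first_childs_parent_token_id, assigns a parent only to the first child of
// each container; step 2, propagate_parent_to_siblings, fills the remaining -1
// entries with a max-scan over runs of equal node level, which works because
// the sentinel -1 loses against any real parent id.
#include <algorithm>
#include <cstdio>
#include <numeric>
#include <vector>

int main()
{
  // Nodes of [{"a":1,"b":2}] in node-id order:
  // 0:List 1:Struct 2:Field"a" 3:Value1 4:Field"b" 5:Value2
  std::vector<int> level  = {0, 1, 2, 3, 2, 3};
  // Step 1: first children get parents; the later sibling (node 4) stays -1.
  std::vector<int> parent = {-1, 0, 1, 2, -1, 4};

  // Step 2: stable-sort node ids by level, then a running max within each
  // level run propagates the first sibling's parent id to the others.
  std::vector<int> order(level.size());
  std::iota(order.begin(), order.end(), 0);
  std::stable_sort(
    order.begin(), order.end(), [&](int a, int b) { return level[a] < level[b]; });
  for (std::size_t i = 1; i < order.size(); ++i) {
    if (level[order[i]] == level[order[i - 1]]) {
      parent[order[i]] = std::max(parent[order[i]], parent[order[i - 1]]);
    }
  }
  for (int p : parent) { std::printf("%d ", p); }  // -1 0 1 2 1 4
}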
rmm::device_uvector node_range_begin(num_nodes, stream, mr); rmm::device_uvector node_range_end(num_nodes, stream, mr); - auto node_range_tuple_it = + auto const node_range_tuple_it = thrust::make_zip_iterator(node_range_begin.begin(), node_range_end.begin()); // Whether the tokenizer stage should keep quote characters for string values // If the tokenizer keeps the quote characters, they may be stripped during type casting constexpr bool include_quote_char = true; - auto node_range_out_it = thrust::make_transform_output_iterator( + auto const node_range_out_it = thrust::make_transform_output_iterator( node_range_tuple_it, node_ranges{tokens, token_indices, include_quote_char}); - auto node_range_out_end = + auto const node_range_out_end = thrust::copy_if(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + num_tokens, @@ -223,69 +375,6 @@ tree_meta_t get_tree_representation(device_span tokens, }); CUDF_EXPECTS(node_range_out_end - node_range_out_it == num_nodes, "node range count mismatch"); - // Node parent ids: previous push token_id transform, stable sort, segmented scan with Max, - // reorder, copy_if. This one is sort of logical stack. But more generalized. - // TODO: make it own function. - rmm::device_uvector parent_token_ids(num_tokens, stream); - rmm::device_uvector initial_order(num_tokens, stream); - // TODO re-write the algorithm to work only on nodes, not tokens. - - thrust::sequence(rmm::exec_policy(stream), initial_order.begin(), initial_order.end()); - thrust::tabulate(rmm::exec_policy(stream), - parent_token_ids.begin(), - parent_token_ids.end(), - [does_push, tokens_gpu = tokens.begin()] __device__(auto i) -> size_type { - return (i > 0) && does_push(tokens_gpu[i - 1]) ? i - 1 : -1; - // -1, not sentinel used here because of max operation below - }); - - auto out_pid = thrust::make_zip_iterator(parent_token_ids.data(), initial_order.data()); - // Uses radix sort for builtin types. - thrust::stable_sort_by_key(rmm::exec_policy(stream), - token_levels.data(), - token_levels.data() + token_levels.size(), - out_pid); - - // SegmentedScan Max. - thrust::inclusive_scan_by_key(rmm::exec_policy(stream), - token_levels.data(), - token_levels.data() + token_levels.size(), - parent_token_ids.data(), - parent_token_ids.data(), - thrust::equal_to{}, - thrust::maximum{}); - - // scatter to restore the original order. - { - rmm::device_uvector temp_storage(num_tokens, stream); - thrust::scatter(rmm::exec_policy(stream), - parent_token_ids.begin(), - parent_token_ids.end(), - initial_order.begin(), - temp_storage.begin()); - thrust::copy( - rmm::exec_policy(stream), temp_storage.begin(), temp_storage.end(), parent_token_ids.begin()); - } - - rmm::device_uvector node_ids_gpu(num_tokens, stream); - thrust::exclusive_scan( - rmm::exec_policy(stream), is_node_it, is_node_it + num_tokens, node_ids_gpu.begin()); - - rmm::device_uvector parent_node_ids(num_nodes, stream, mr); - auto parent_node_ids_it = thrust::make_transform_iterator( - parent_token_ids.begin(), - [node_ids_gpu = node_ids_gpu.begin()] __device__(size_type const pid) -> NodeIndexT { - return pid < 0 ? 
parent_node_sentinel : node_ids_gpu[pid]; - }); - auto parent_node_ids_end = thrust::copy_if(rmm::exec_policy(stream), - parent_node_ids_it, - parent_node_ids_it + parent_token_ids.size(), - tokens.begin(), - parent_node_ids.begin(), - is_node); - CUDF_EXPECTS(parent_node_ids_end - parent_node_ids.begin() == num_nodes, - "parent node id gather mismatch"); - return {std::move(node_categories), std::move(parent_node_ids), std::move(node_levels), @@ -312,38 +401,45 @@ rmm::device_uvector hash_node_type_with_field_name(device_span>; using hash_map_type = cuco::static_map; - auto num_nodes = d_tree.node_categories.size(); + + auto const num_nodes = d_tree.node_categories.size(); + auto const num_fields = thrust::count(rmm::exec_policy(stream), + d_tree.node_categories.begin(), + d_tree.node_categories.end(), + node_t::NC_FN); constexpr size_type empty_node_index_sentinel = -1; - hash_map_type key_map{compute_hash_table_size(num_nodes), // TODO reduce oversubscription + hash_map_type key_map{compute_hash_table_size(num_fields, 40), // 40% occupancy in hash map cuco::sentinel::empty_key{empty_node_index_sentinel}, cuco::sentinel::empty_value{empty_node_index_sentinel}, hash_table_allocator_type{default_allocator{}, stream}, stream.value()}; - auto d_hasher = [d_input = d_input.data(), - node_range_begin = d_tree.node_range_begin.data(), - node_range_end = d_tree.node_range_end.data()] __device__(auto node_id) { + auto const d_hasher = [d_input = d_input.data(), + node_range_begin = d_tree.node_range_begin.data(), + node_range_end = d_tree.node_range_end.data()] __device__(auto node_id) { auto const field_name = cudf::string_view(d_input + node_range_begin[node_id], node_range_end[node_id] - node_range_begin[node_id]); return cudf::detail::default_hash{}(field_name); }; - auto d_equal = [d_input = d_input.data(), - node_range_begin = d_tree.node_range_begin.data(), - node_range_end = d_tree.node_range_end.data()] __device__(auto node_id1, - auto node_id2) { + auto const d_equal = [d_input = d_input.data(), + node_range_begin = d_tree.node_range_begin.data(), + node_range_end = d_tree.node_range_end.data()] __device__(auto node_id1, + auto node_id2) { auto const field_name1 = cudf::string_view( d_input + node_range_begin[node_id1], node_range_end[node_id1] - node_range_begin[node_id1]); auto const field_name2 = cudf::string_view( d_input + node_range_begin[node_id2], node_range_end[node_id2] - node_range_begin[node_id2]); return field_name1 == field_name2; }; - auto is_field_name_node = [node_categories = d_tree.node_categories.data()] __device__( - auto node_id) { return node_categories[node_id] == node_t::NC_FN; }; // key-value pairs: uses node_id itself as node_type. (unique node_id for a field name due to // hashing) - auto iter = cudf::detail::make_counting_transform_iterator( + auto const iter = cudf::detail::make_counting_transform_iterator( 0, [] __device__(size_type i) { return cuco::make_pair(i, i); }); + auto const is_field_name_node = [node_categories = + d_tree.node_categories.data()] __device__(auto node_id) { + return node_categories[node_id] == node_t::NC_FN; + }; key_map.insert_if(iter, iter + num_nodes, thrust::counting_iterator(0), // stencil @@ -351,9 +447,10 @@ rmm::device_uvector hash_node_type_with_field_name(device_span size_type { - auto it = key_map.find(node_id, d_hasher, d_equal); + auto const it = key_map.find(node_id, d_hasher, d_equal); return (it == key_map.end()) ? 
size_type{0} : it->second.load(cuda::std::memory_order_relaxed); }; @@ -373,211 +470,225 @@ rmm::device_uvector hash_node_type_with_field_name(device_span translate_sorted_parent_node_indices( - device_span scatter_indices, +// Two level hashing algorithm +// 1. Convert node_category+fieldname to node_type. (passed as argument) +// a. Create a hashmap to hash field name and assign unique node id as values. +// b. Convert the node categories to node types. +// Node type is defined as node category enum value if it is not a field node, +// otherwise it is the unique node id assigned by the hashmap (value shifted by #NUM_CATEGORY). +// 2. Set operation on entire path of each node +// a. Create a hash map with hash of {node_level, node_type} of the node and of every parent +// up to the root. +// b. While creating hashmap, transform node id to unique node ids that are inserted into the +// hash map. This mimics a set operation with a hash map. These unique node ids are the set ids. +// c. Return these converted set ids, which are the hash map keys/values, and the unique set ids. +std::pair, rmm::device_uvector> hash_node_path( + device_span node_levels, + device_span node_type, device_span parent_node_ids, - rmm::cuda_stream_view stream) + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - auto const num_nodes = scatter_indices.size(); - auto const gather_indices = cudf::detail::scatter_to_gather( - scatter_indices.begin(), scatter_indices.end(), num_nodes, stream); + auto const num_nodes = parent_node_ids.size(); + rmm::device_uvector col_id(num_nodes, stream, mr); - rmm::device_uvector parent_indices(num_nodes, stream); - // gather, except parent sentinels - thrust::transform(rmm::exec_policy(stream), - parent_node_ids.begin(), - parent_node_ids.end(), - parent_indices.begin(), - [gather_indices = gather_indices.data()] __device__(auto parent_node_id) { - return (parent_node_id == parent_node_sentinel) - ? parent_node_sentinel - : gather_indices[parent_node_id]; - }); - return parent_indices; -}; + using hash_table_allocator_type = rmm::mr::stream_allocator_adaptor>; + using hash_map_type = + cuco::static_map; + + constexpr size_type empty_node_index_sentinel = -1; + hash_map_type key_map{compute_hash_table_size(num_nodes), // TODO reduce oversubscription + cuco::sentinel::empty_key{empty_node_index_sentinel}, + cuco::sentinel::empty_value{empty_node_index_sentinel}, + cuco::sentinel::erased_key{-2}, + hash_table_allocator_type{default_allocator{}, stream}, + stream.value()}; + // path compression is not used since extra writes make all map operations slow.
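// A compact host model of the set-id scheme described in the comment above
// (std::hash and a boost-style combine stand in for the device hashers, and a
// std::map plays the role of the cuco map's insert_and_find; all data is made
// up). Two nodes end with the same column id exactly when (level, type)
// matches along their entire ancestor chains; the real code additionally
// compares the full paths to guard against hash collisions. The tail shows the
// sort + lower_bound densification performed by the caller.
#include <algorithm>
#include <cstdio>
#include <functional>
#include <map>
#include <vector>

std::size_t combine(std::size_t a, std::size_t b)
{
  return a ^ (b + 0x9e3779b9 + (a << 6) + (a >> 2));
}

int main()
{
  // [{"a":1},{"a":2}]: 0:list 1:struct 2:field"a" 3:value 4:struct 5:field"a" 6:value
  std::vector<int> level  = {0, 1, 2, 3, 1, 2, 3};
  std::vector<int> type   = {1, 2, 7, 4, 2, 7, 4};  // 7 = node_type of field name "a"
  std::vector<int> parent = {-1, 0, 1, 2, 0, 4, 5};

  auto path_hash = [&](int n) {
    std::size_t h = combine(std::hash<int>{}(level[n]), std::hash<int>{}(type[n]));
    for (int p = parent[n]; p != -1; p = parent[p]) {
      h = combine(combine(h, std::hash<int>{}(level[p])), std::hash<int>{}(type[p]));
    }
    return h;
  };

  // insert_and_find: the first node inserted for a given path becomes the set id
  std::map<std::size_t, int> first_seen;
  std::vector<int> set_id(level.size());
  for (int n = 0; n < static_cast<int>(level.size()); ++n) {
    set_id[n] = first_seen.emplace(path_hash(n), n).first->second;
  }

  // densify: sort the surviving unique ids, then replace each id by its rank
  std::vector<int> uniq;
  for (auto const& kv : first_seen) { uniq.push_back(kv.second); }
  std::sort(uniq.begin(), uniq.end());
  for (auto& id : set_id) {
    id = static_cast<int>(std::lower_bound(uniq.begin(), uniq.end(), id) - uniq.begin());
  }
  for (int id : set_id) { std::printf("%d ", id); }  // 0 1 2 3 1 2 3
}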
+ auto const d_hasher = [node_level = node_levels.begin(), + node_type = node_type.begin(), + parent_node_ids = parent_node_ids.begin()] __device__(auto node_id) { + auto hash = + cudf::detail::hash_combine(cudf::detail::default_hash{}(node_level[node_id]), + cudf::detail::default_hash{}(node_type[node_id])); + node_id = parent_node_ids[node_id]; + while (node_id != parent_node_sentinel) { + hash = cudf::detail::hash_combine( + hash, cudf::detail::default_hash{}(node_level[node_id])); + hash = cudf::detail::hash_combine( + hash, cudf::detail::default_hash{}(node_type[node_id])); + node_id = parent_node_ids[node_id]; + } + return hash; + }; + + rmm::device_uvector node_hash(num_nodes, stream); + thrust::tabulate(rmm::exec_policy(stream), node_hash.begin(), node_hash.end(), d_hasher); + auto const d_hashed_cache = [node_hash = node_hash.begin()] __device__(auto node_id) { + return node_hash[node_id]; + }; + + auto const d_equal = [node_level = node_levels.begin(), + node_type = node_type.begin(), + parent_node_ids = parent_node_ids.begin(), + d_hashed_cache] __device__(auto node_id1, auto node_id2) { + if (node_id1 == node_id2) return true; + if (d_hashed_cache(node_id1) != d_hashed_cache(node_id2)) return false; + auto const is_equal_level = [node_level, node_type](auto node_id1, auto node_id2) { + if (node_id1 == node_id2) return true; + return node_level[node_id1] == node_level[node_id2] and + node_type[node_id1] == node_type[node_id2]; + }; + // if both nodes have the same node type at all levels, keep walking up until a common + // parent or the root is reached. + while (node_id1 != parent_node_sentinel and node_id2 != parent_node_sentinel and + node_id1 != node_id2 and is_equal_level(node_id1, node_id2)) { + node_id1 = parent_node_ids[node_id1]; + node_id2 = parent_node_ids[node_id2]; + } + return node_id1 == node_id2; + }; + + // insert and convert node ids to unique set ids + auto const num_inserted = thrust::count_if( + rmm::exec_policy(stream), + thrust::make_counting_iterator(0), + thrust::make_counting_iterator(num_nodes), + [d_hashed_cache, + d_equal, + view = key_map.get_device_mutable_view(), + uq_node_id = col_id.begin()] __device__(auto node_id) mutable { + auto it = view.insert_and_find(cuco::make_pair(node_id, node_id), d_hashed_cache, d_equal); + uq_node_id[node_id] = (it.first)->first.load(cuda::std::memory_order_relaxed); + return it.second; + }); + + auto const num_columns = num_inserted; // key_map.get_size() is not updated. + rmm::device_uvector unique_keys(num_columns, stream); + key_map.retrieve_all(unique_keys.begin(), thrust::make_discard_iterator(), stream.value()); + + return {std::move(col_id), std::move(unique_keys)}; +} /** - * @brief Generates column id and parent column id for each node from the node_level sorted inputs + * @brief Generates column id and parent column id for each node * - * 4. Per-Level Processing: Propagate parent node ids for each level. - * For each level, - * a. gather col_id from previous level results. input=col_id, gather_map is parent_indices. - * b. stable sort by {parent_col_id, node_type} - * c. scan sum of unique {parent_col_id, node_type} - * d. scatter the col_id back to stable node_level order (using scatter_indices) + * 1. Generate col_id: + * a. Set operation on entire path of each node, translate each node id to set id. + * (two level hashing) + * b. gather unique set ids. + * c. sort and use binary search to generate column ids. + * d. Translate parent node ids to parent column ids.
* - * pre-condition: All input arguments are stable sorted by node_level (stable in node_id order) - * post-condition: Returned column_id, parent_col_id are level sorted. - * @param node_type Unique id to identify node type, field with different name has different id. - * @param parent_indices Parent node indices in the sorted node_level order - * @param d_level_boundaries The boundaries of each level in the sorted node_level order + * All inputs and outputs are in node_id order. + * @param d_input JSON string in device memory + * @param d_tree Tree representation of the JSON * @param stream CUDA stream used for device memory operations and kernel launches * @param mr Device memory resource used to allocate the returned column's device memory * @return column_id, parent_column_id */ std::pair, rmm::device_uvector> generate_column_id( - device_span node_type, // level sorted - device_span parent_indices, // level sorted - device_span d_level_boundaries, + device_span d_input, + tree_meta_t const& d_tree, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); + auto const num_nodes = d_tree.node_categories.size(); - auto const num_nodes = node_type.size(); - rmm::device_uvector col_id(num_nodes, stream, mr); - rmm::device_uvector parent_col_id(num_nodes, stream); - if (num_nodes == 0) { return {std::move(col_id), std::move(parent_col_id)}; } - rmm::device_uvector scatter_indices(num_nodes, stream); - thrust::sequence(rmm::exec_policy(stream), scatter_indices.begin(), scatter_indices.end()); - // scatter 1 to level_boundaries alone, useful for scan later - thrust::scatter(rmm::exec_policy(stream), - thrust::make_constant_iterator(1), - thrust::make_constant_iterator(1) + d_level_boundaries.size() - 1, - d_level_boundaries.begin(), - col_id.begin()); - auto level_boundaries = cudf::detail::make_std_vector_async(d_level_boundaries, stream); - // Initialize First level node's node col_id to 0 - thrust::fill(rmm::exec_policy(stream), col_id.begin(), col_id.begin() + level_boundaries[0], 0); - // Initialize First level node's parent_col_id to parent_node_sentinel sentinel - thrust::fill(rmm::exec_policy(stream), - parent_col_id.begin(), - parent_col_id.begin() + level_boundaries[0], - parent_node_sentinel); - - // Per-level processing - auto const num_levels = level_boundaries.size(); - for (size_t level = 1; level < num_levels; level++) { - // Gather the each node's parent's column id for the nodes of the current level - thrust::gather(rmm::exec_policy(stream), - parent_indices.data() + level_boundaries[level - 1], - parent_indices.data() + level_boundaries[level], - col_id.data(), - parent_col_id.data() + level_boundaries[level - 1]); - - // To invoke Radix sort for keys {parent_col_id, node_type} instead of merge sort, - // we need to split to 2 Radix sorts. 
- // Secondary sort on node_type - - thrust::stable_sort_by_key( - rmm::exec_policy(stream), - node_type.data() + level_boundaries[level - 1], - node_type.data() + level_boundaries[level], - thrust::make_zip_iterator(parent_col_id.begin() + level_boundaries[level - 1], - scatter_indices.begin())); - // Primary sort on parent_col_id - thrust::stable_sort_by_key( - rmm::exec_policy(stream), - parent_col_id.begin() + level_boundaries[level - 1], - parent_col_id.begin() + level_boundaries[level], - thrust::make_zip_iterator(node_type.data() + level_boundaries[level - 1], - scatter_indices.begin())); - - auto start_it = thrust::make_zip_iterator(parent_col_id.begin() + level_boundaries[level - 1], - node_type.data() + level_boundaries[level - 1]); - auto adjacent_pair_it = thrust::make_zip_iterator(start_it - 1, start_it); - // Compares two adjacent items, beginning with the first and second item from the current level. - // Writes flags to the index of the rhs item. - // First index holds next col_id from previous level. - thrust::transform(rmm::exec_policy(stream), - adjacent_pair_it + 1, - adjacent_pair_it + level_boundaries[level] - level_boundaries[level - 1], - col_id.data() + level_boundaries[level - 1] + 1, - [] __device__(auto adjacent_pair) -> size_type { - auto const lhs = thrust::get<0>(adjacent_pair); - auto const rhs = thrust::get<1>(adjacent_pair); - return lhs != rhs ? 1 : 0; - }); - - // includes previous level last col_id to continue the index. - thrust::inclusive_scan(rmm::exec_policy(stream), - col_id.data() + level_boundaries[level - 1], - col_id.data() + level_boundaries[level] + (level != num_levels - 1), - // +1 only for not-last-levels, for next level start col_id - col_id.data() + level_boundaries[level - 1]); - - // scatter to restore original order. - auto const num_nodes_per_level = level_boundaries[level] - level_boundaries[level - 1]; - { - rmm::device_uvector tmp_col_id(num_nodes_per_level, stream); - rmm::device_uvector tmp_parent_col_id(num_nodes_per_level, stream); - thrust::scatter(rmm::exec_policy(stream), - thrust::make_zip_iterator(col_id.begin() + level_boundaries[level - 1], - parent_col_id.data() + level_boundaries[level - 1]), - thrust::make_zip_iterator(col_id.begin() + level_boundaries[level], - parent_col_id.data() + level_boundaries[level]), - scatter_indices.begin(), - thrust::make_zip_iterator(tmp_col_id.begin(), tmp_parent_col_id.begin())); - thrust::copy(rmm::exec_policy(stream), - tmp_col_id.begin(), - tmp_col_id.end(), - col_id.begin() + level_boundaries[level - 1]); - thrust::copy(rmm::exec_policy(stream), - tmp_parent_col_id.begin(), - tmp_parent_col_id.end(), - parent_col_id.begin() + level_boundaries[level - 1]); - } - thrust::sequence(rmm::exec_policy(stream), - scatter_indices.begin(), - scatter_indices.begin() + num_nodes_per_level); - } + // Two level hashing: + // one for field names -> node_type and, + // another for {node_level, node_category} + field hash for the entire path + // which is {node_level, node_type} recursively using parent_node_id + auto [col_id, unique_keys] = [&]() { + // Convert node_category + field_name to node_type. + rmm::device_uvector node_type = + hash_node_type_with_field_name(d_input, d_tree, stream); + + // hash entire path from node to root. 
+ return hash_node_path(d_tree.node_levels, node_type, d_tree.parent_node_ids, stream, mr); + }(); + + thrust::sort(rmm::exec_policy(stream), unique_keys.begin(), unique_keys.end()); + thrust::lower_bound(rmm::exec_policy(stream), + unique_keys.begin(), + unique_keys.end(), + col_id.begin(), + col_id.end(), + col_id.begin()); + + rmm::device_uvector parent_col_id(num_nodes, stream, mr); + thrust::transform(rmm::exec_policy(stream), + d_tree.parent_node_ids.begin(), + d_tree.parent_node_ids.end(), + parent_col_id.begin(), + [col_id = col_id.begin()] __device__(auto node_id) { + return node_id >= 0 ? col_id[node_id] : parent_node_sentinel; + }); return {std::move(col_id), std::move(parent_col_id)}; } /** * @brief Computes row indices of each node in the hierarchy. - * 5. Generate row_offset. - * a. stable_sort by parent_col_id. - * b. scan_by_key {parent_col_id} (required only on nodes who's parent is list) - * c. propagate to non-list leaves from parent list node by recursion + * 2. Generate row_offset. + * a. Extract only list children + * b. stable_sort by parent_col_id. + * c. scan_by_key {parent_col_id} (done only on nodes whose parent is a list) + * d. propagate to non-list leaves from parent list node by recursion * * pre-condition: - * scatter_indices is a sequence, representing node_id. * d_tree.node_categories, d_tree.parent_node_ids, parent_col_id are in order of node_id. * post-condition: row_offsets is in order of node_id. - * parent_col_id and scatter_indices are sorted by parent_col_id. (unused after this function) - * @param scatter_indices node_id + * parent_col_id is moved and reused inside this function. * @param parent_col_id parent node's column id * @param d_tree Tree representation of the JSON string * @param stream CUDA stream used for device memory operations and kernel launches. * @param mr Device memory resource used to allocate the returned column's device memory. * @return row_offsets */ -rmm::device_uvector compute_row_offsets(device_span scatter_indices, - rmm::device_uvector&& parent_col_id, - tree_meta_t& d_tree, +rmm::device_uvector compute_row_offsets(rmm::device_uvector&& parent_col_id, + tree_meta_t const& d_tree, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); auto const num_nodes = d_tree.node_categories.size(); - // TODO generate scatter_indices sequences here itself - thrust::stable_sort_by_key( - rmm::exec_policy(stream), parent_col_id.begin(), parent_col_id.end(), scatter_indices.begin()); + + rmm::device_uvector scatter_indices(num_nodes, stream); + thrust::sequence(rmm::exec_policy(stream), scatter_indices.begin(), scatter_indices.end()); + + // Extract only list children. 
(nodes whose parent is a list/root) + auto const list_parent_end = + thrust::remove_if(rmm::exec_policy(stream), + thrust::make_zip_iterator(parent_col_id.begin(), scatter_indices.begin()), + thrust::make_zip_iterator(parent_col_id.end(), scatter_indices.end()), + d_tree.parent_node_ids.begin(), + [node_categories = d_tree.node_categories.begin()] __device__(auto pnid) { + return !(pnid == parent_node_sentinel || node_categories[pnid] == NC_LIST); + }); + auto const num_list_parent = thrust::distance( + thrust::make_zip_iterator(parent_col_id.begin(), scatter_indices.begin()), list_parent_end); + + thrust::stable_sort_by_key(rmm::exec_policy(stream), + parent_col_id.begin(), + parent_col_id.begin() + num_list_parent, + scatter_indices.begin()); + rmm::device_uvector row_offsets(num_nodes, stream, mr); // TODO is it possible to generate list child_offsets too here? - thrust::exclusive_scan_by_key( - rmm::exec_policy(stream), - parent_col_id.begin(), // TODO: is there any way to limit this to list parents alone? - parent_col_id.end(), - thrust::make_constant_iterator(1), - row_offsets.begin()); + // write only 1st child offset to parent node id child_offsets? + thrust::exclusive_scan_by_key(rmm::exec_policy(stream), + parent_col_id.begin(), + parent_col_id.begin() + num_list_parent, + thrust::make_constant_iterator(1), + row_offsets.begin()); // Using scatter instead of sort. auto& temp_storage = parent_col_id; // reuse parent_col_id as temp storage thrust::scatter(rmm::exec_policy(stream), row_offsets.begin(), - row_offsets.end(), + row_offsets.begin() + num_list_parent, scatter_indices.begin(), temp_storage.begin()); row_offsets = std::move(temp_storage); @@ -601,126 +712,37 @@ rmm::device_uvector compute_row_offsets(device_span scatte }, [node_categories = d_tree.node_categories.data(), parent_node_ids = d_tree.parent_node_ids.begin()] __device__(size_type node_id) { - auto parent_node_id = parent_node_ids[node_id]; + auto const parent_node_id = parent_node_ids[node_id]; return parent_node_id != parent_node_sentinel and !(node_categories[parent_node_id] == node_t::NC_LIST); }); return row_offsets; } -/** -@note -This algorithm assigns a unique column id to each node in the tree. -The row offset is the row index of the node in that column id. -Algorithm: -1. Convert node_category+fieldname to node_type. - a. Create a hashmap to hash field name and assign unique node id as values. - b. Convert the node categories to node types. - Node type is defined as node category enum value if it is not a field node, - otherwise it is the unique node id assigned by the hashmap (value shifted by #NUM_CATEGORY). -2. Preprocessing: Translate parent node ids after sorting by level. - a. sort by level - b. get gather map of sorted indices - c. translate parent_node_ids to new sorted indices -3. Find level boundaries. - copy_if index of first unique values of sorted levels. -4. Per-Level Processing: Propagate parent node ids for each level. - For each level, - a. gather col_id from previous level results. input=col_id, gather_map is parent_indices. - b. stable sort by {parent_col_id, node_type} - c. scan sum of unique {parent_col_id, node_type} - d. scatter the col_id back to stable node_level order (using scatter_indices) - Restore original node_id order -5. Generate row_offset. - a. stable_sort by parent_col_id. - b. scan_by_key {parent_col_id} (required only on nodes whose parent is a list) - c. 
propagate to non-list leaves from parent list node by recursion -**/ +// This algorithm assigns a unique column id to each node in the tree. +// The row offset is the row index of the node in that column id. +// Algorithm: +// 1. Generate col_id: +// a. Set operation on entire path of each node, translate each node id to set id. +// b. gather unique set ids. +// c. sort and use binary search to generate column ids. +// d. Translate parent node ids to parent column ids. +// 2. Generate row_offset. +// a. filter only list children +// b. stable_sort by parent_col_id. +// c. scan_by_key {parent_col_id} (done only on nodes whose parent is a list) +// d. propagate to non-list leaves from parent list node by recursion std::tuple, rmm::device_uvector> records_orient_tree_traversal(device_span d_input, - tree_meta_t& d_tree, + tree_meta_t const& d_tree, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - // 1. Convert node_category + field_name to node_type. - - auto num_nodes = d_tree.node_categories.size(); - rmm::device_uvector node_type = - hash_node_type_with_field_name(d_input, d_tree, stream); - // TODO two-level hashing: one for field names - // and another for {node-level, node_category} + field hash for the entire path - - // 2. Preprocessing: Translate parent node ids after sorting by level. - // a. sort by level - // b. get gather map of sorted indices - // c. translate parent_node_ids to sorted indices - - rmm::device_uvector scatter_indices(num_nodes, stream); - thrust::sequence(rmm::exec_policy(stream), scatter_indices.begin(), scatter_indices.end()); - - rmm::device_uvector parent_node_ids(d_tree.parent_node_ids, stream); // make a copy - auto out_pid = - thrust::make_zip_iterator(scatter_indices.data(), parent_node_ids.data(), node_type.data()); - // Uses cub radix sort. sort by level - thrust::stable_sort_by_key(rmm::exec_policy(stream), - d_tree.node_levels.data(), - d_tree.node_levels.data() + num_nodes, - out_pid); - - rmm::device_uvector parent_indices = - translate_sorted_parent_node_indices(scatter_indices, parent_node_ids, stream); - // TODO optimize memory usage: parent_node_ids is no longer needed - - // 3. Find level boundaries. - auto level_boundaries = [&]() { - if (d_tree.node_levels.is_empty()) return rmm::device_uvector{0, stream}; - // Already node_levels is sorted - auto max_level = d_tree.node_levels.back_element(stream); - rmm::device_uvector level_boundaries(max_level + 1, stream); - // TODO try reduce_by_key - auto level_end = - thrust::copy_if(rmm::exec_policy(stream), - thrust::make_counting_iterator(1), - thrust::make_counting_iterator(num_nodes + 1), - level_boundaries.begin(), - [num_nodes, node_levels = d_tree.node_levels.begin()] __device__(auto index) { - return index == num_nodes || node_levels[index] != node_levels[index - 1]; - }); - CUDF_EXPECTS(thrust::distance(level_boundaries.begin(), level_end) == max_level + 1, - "num_levels != max_level + 1"); - return level_boundaries; - }; - - // 4. Per-Level Processing: Propagate parent node ids for each level. 
- auto [col_id, parent_col_id] = generate_column_id(node_type, // level sorted - parent_indices, // level sorted - level_boundaries(), - stream, - mr); - - // restore original order of col_id, parent_col_id and used d_tree members - { - rmm::device_uvector tmp_col_id(num_nodes, stream); - rmm::device_uvector tmp_parent_col_id(num_nodes, stream); - rmm::device_uvector tmp_node_levels(num_nodes, stream); - thrust::scatter( - rmm::exec_policy(stream), - thrust::make_zip_iterator(col_id.begin(), parent_col_id.begin(), d_tree.node_levels.begin()), - thrust::make_zip_iterator(col_id.end(), parent_col_id.end(), d_tree.node_levels.end()), - scatter_indices.begin(), - thrust::make_zip_iterator( - tmp_col_id.begin(), tmp_parent_col_id.begin(), tmp_node_levels.begin())); - col_id = std::move(tmp_col_id); - parent_col_id = std::move(tmp_parent_col_id); - d_tree.node_levels = std::move(tmp_node_levels); - thrust::sequence(rmm::exec_policy(stream), scatter_indices.begin(), scatter_indices.end()); - } + auto [new_col_id, new_parent_col_id] = generate_column_id(d_input, d_tree, stream, mr); - // 5. Generate row_offset. - auto row_offsets = - compute_row_offsets(scatter_indices, std::move(parent_col_id), d_tree, stream, mr); - return std::tuple{std::move(col_id), std::move(row_offsets)}; + auto row_offsets = compute_row_offsets(std::move(new_parent_col_id), d_tree, stream, mr); + return std::tuple{std::move(new_col_id), std::move(row_offsets)}; } } // namespace detail diff --git a/cpp/src/io/json/nested_json.hpp b/cpp/src/io/json/nested_json.hpp index 10d209b2ea6..35c09c89d8b 100644 --- a/cpp/src/io/json/nested_json.hpp +++ b/cpp/src/io/json/nested_json.hpp @@ -104,6 +104,9 @@ enum node_t : NodeT { */ enum class json_col_t : char { ListColumn, StructColumn, StringColumn, Unknown }; +// Default name for a list's child column +constexpr auto list_child_name{"element"}; + /** * @brief Intermediate representation of data from a nested JSON input */ @@ -319,7 +322,7 @@ tree_meta_t get_tree_representation( std::tuple, rmm::device_uvector> records_orient_tree_traversal( device_span d_input, - tree_meta_t& d_tree, + tree_meta_t const& d_tree, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); diff --git a/cpp/src/io/json/nested_json_gpu.cu b/cpp/src/io/json/nested_json_gpu.cu index 5d60a564b9b..0c35930c2e4 100644 --- a/cpp/src/io/json/nested_json_gpu.cu +++ b/cpp/src/io/json/nested_json_gpu.cu @@ -1162,9 +1162,6 @@ void make_json_column(json_column& root_column, // Range of encapsulating function that parses to internal columnar data representation CUDF_FUNC_RANGE(); - // Default name for a list's child column - std::string const list_child_name = "element"; - // Parse the JSON and get the token stream const auto [d_tokens_gpu, d_token_indices_gpu] = get_token_stream(d_input, options, stream, mr); @@ -1286,7 +1283,7 @@ void make_json_column(json_column& root_column, * (b) a list, the selected child column corresponds to single child column of * the list column. In this case, the child column may not exist yet. 
*/ - auto get_selected_column = [&list_child_name](std::stack& current_data_path) { + auto get_selected_column = [](std::stack& current_data_path) { json_column* selected_col = current_data_path.top().current_selected_col; // If the node does not have a selected column yet @@ -1543,7 +1540,7 @@ auto parsing_options(cudf::io::json_reader_options const& options) { auto parse_opts = cudf::io::parse_options{',', '\n', '\"', '.'}; - auto const stream = cudf::default_stream_value; + auto const stream = cudf::get_default_stream(); parse_opts.dayfirst = options.is_enabled_dayfirst(); parse_opts.keepquotes = options.is_enabled_keep_quotes(); parse_opts.trie_true = cudf::detail::create_serialized_trie({"true"}, stream); @@ -1680,7 +1677,8 @@ std::pair, std::vector> json_column_to size_type num_rows = json_col.child_offsets.size(); std::vector column_names{}; column_names.emplace_back("offsets"); - column_names.emplace_back(json_col.child_columns.begin()->first); + column_names.emplace_back( + json_col.child_columns.empty() ? list_child_name : json_col.child_columns.begin()->first); rmm::device_uvector d_offsets = cudf::detail::make_device_uvector_async(json_col.child_offsets, stream, mr); @@ -1688,12 +1686,15 @@ std::pair, std::vector> json_column_to std::make_unique(data_type{type_id::INT32}, num_rows, d_offsets.release()); // Create children column auto [child_column, names] = - json_column_to_cudf_column(json_col.child_columns.begin()->second, - d_input, - options, - get_child_schema(json_col.child_columns.begin()->first), - stream, - mr); + json_col.child_columns.empty() + ? std::pair, + std::vector>{std::make_unique(), {}} + : json_column_to_cudf_column(json_col.child_columns.begin()->second, + d_input, + options, + get_child_schema(json_col.child_columns.begin()->first), + stream, + mr); column_names.back().children = names; auto [result_bitmask, null_count] = make_validity(json_col); return {make_lists_column(num_rows - 1, diff --git a/cpp/src/io/json/reader_impl.cu b/cpp/src/io/json/reader_impl.cu index 48b2af81fcd..4bbe91b61d2 100644 --- a/cpp/src/io/json/reader_impl.cu +++ b/cpp/src/io/json/reader_impl.cu @@ -26,6 +26,7 @@ #include #include +#include #include #include #include @@ -222,6 +223,7 @@ std::vector ingest_raw_input(std::vector> c size_t range_size, size_t range_size_padded) { + CUDF_FUNC_RANGE(); // Iterate through the user defined sources and read the contents into the local buffer size_t total_source_size = 0; for (const auto& source : sources) { @@ -313,6 +315,7 @@ rmm::device_uvector upload_data_to_device(json_reader_options const& reade rmm::device_uvector& rec_starts, rmm::cuda_stream_view stream) { + CUDF_FUNC_RANGE(); size_t end_offset = h_data.size(); // Trim lines that are outside range @@ -592,6 +595,7 @@ table_with_metadata read_json(std::vector>& sources, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr) { + CUDF_FUNC_RANGE(); if (reader_opts.is_enabled_experimental()) { return experimental::read_json(sources, reader_opts, stream, mr); } diff --git a/cpp/src/io/orc/dict_enc.cu b/cpp/src/io/orc/dict_enc.cu index 0b5de26adfc..898df3ef0f9 100644 --- a/cpp/src/io/orc/dict_enc.cu +++ b/cpp/src/io/orc/dict_enc.cu @@ -14,9 +14,9 @@ * limitations under the License. 
*/ -#include "orc_common.hpp" #include "orc_gpu.hpp" +#include #include #include diff --git a/cpp/src/io/orc/orc.hpp b/cpp/src/io/orc/orc.hpp index 2018024f566..44882b71925 100644 --- a/cpp/src/io/orc/orc.hpp +++ b/cpp/src/io/orc/orc.hpp @@ -16,11 +16,10 @@ #pragma once -#include "orc_common.hpp" - #include #include #include +#include #include #include @@ -37,6 +36,9 @@ namespace cudf { namespace io { namespace orc { + +static constexpr uint32_t block_header_size = 3; + struct PostScript { uint64_t footerLength = 0; // the length of the footer section in bytes CompressionKind compression = NONE; // the kind of generic compression used diff --git a/cpp/src/io/orc/orc_gpu.hpp b/cpp/src/io/orc/orc_gpu.hpp index c7a7a423cf2..1e4e36ee91c 100644 --- a/cpp/src/io/orc/orc_gpu.hpp +++ b/cpp/src/io/orc/orc_gpu.hpp @@ -19,8 +19,8 @@ #include "timezone.cuh" #include "orc.hpp" -#include "orc_common.hpp" +#include #include #include #include diff --git a/cpp/src/io/orc/reader_impl.cu b/cpp/src/io/orc/reader_impl.cu index 7fb83b2a24e..0623e35741d 100644 --- a/cpp/src/io/orc/reader_impl.cu +++ b/cpp/src/io/orc/reader_impl.cu @@ -379,8 +379,10 @@ rmm::device_buffer reader::impl::decompress_stripe_data( device_span> inflate_out_view{inflate_out.data(), num_compressed_blocks}; switch (decompressor.compression()) { case compression_type::ZLIB: - // See https://github.com/rapidsai/cudf/issues/11812 - if (false) { + if (nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE)) { + gpuinflate( + inflate_in_view, inflate_out_view, inflate_res, gzip_header_included::NO, stream); + } else { nvcomp::batched_decompress(nvcomp::compression_type::DEFLATE, inflate_in_view, inflate_out_view, @@ -388,13 +390,12 @@ rmm::device_buffer reader::impl::decompress_stripe_data( max_uncomp_block_size, total_decomp_size, stream); - } else { - gpuinflate( - inflate_in_view, inflate_out_view, inflate_res, gzip_header_included::NO, stream); } break; case compression_type::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { + if (nvcomp::is_decompression_disabled(nvcomp::compression_type::SNAPPY)) { + gpu_unsnap(inflate_in_view, inflate_out_view, inflate_res, stream); + } else { nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, inflate_in_view, inflate_out_view, @@ -402,11 +403,13 @@ rmm::device_buffer reader::impl::decompress_stripe_data( max_uncomp_block_size, total_decomp_size, stream); - } else { - gpu_unsnap(inflate_in_view, inflate_out_view, inflate_res, stream); } break; case compression_type::ZSTD: + if (auto const reason = nvcomp::is_decompression_disabled(nvcomp::compression_type::ZSTD); + reason) { + CUDF_FAIL("Decompression error: " + reason.value()); + } nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, inflate_in_view, inflate_out_view, @@ -522,8 +525,8 @@ void update_null_mask(cudf::detail::hostdevice_2dvector& chunks parent_mask_len, mask_state::ALL_NULL, rmm::cuda_stream_view(stream), mr); auto merged_mask = static_cast(merged_null_mask.data()); uint32_t* dst_idx_ptr = dst_idx.data(); - // Copy child valid bits from child column to valid indexes, this will merge both child and - // parent null masks + // Copy child valid bits from child column to valid indexes, this will merge both child + // and parent null masks thrust::for_each(rmm::exec_policy(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(0) + dst_idx.size(), @@ -964,7 +967,6 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Association between each ORC column and its cudf::column 
_col_meta.orc_col_map.emplace_back(_metadata.get_num_cols(), -1); std::vector nested_col; - bool is_data_empty = false; // Get a list of column data types std::vector column_types; @@ -988,7 +990,6 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Map each ORC column to its column _col_meta.orc_col_map[level][col.id] = column_types.size() - 1; - // TODO: Once MAP type is supported in cuDF, update this for MAP as well if (col_type == type_id::LIST or col_type == type_id::STRUCT) nested_col.emplace_back(col); } @@ -1048,6 +1049,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, size_t num_rowgroups = 0; int stripe_idx = 0; + bool is_level_data_empty = true; std::vector, size_t>> read_tasks; for (auto const& stripe_source_mapping : selected_stripes) { // Iterate through the source files selected stripes @@ -1067,21 +1069,16 @@ table_with_metadata reader::impl::read(size_type skip_rows, stream_info, level == 0); - if (total_data_size == 0) { - CUDF_EXPECTS(stripe_info->indexLength == 0, "Invalid index rowgroup stream data"); - // In case ROW GROUP INDEX is not present and all columns are structs with no null - // stream, there is nothing to read at this level. - auto fn_check_dtype = [](auto dtype) { return dtype.id() == type_id::STRUCT; }; - CUDF_EXPECTS(std::all_of(column_types.begin(), column_types.end(), fn_check_dtype), - "Expected streams data within stripe"); - is_data_empty = true; - } + auto const is_stripe_data_empty = total_data_size == 0; + if (not is_stripe_data_empty) { is_level_data_empty = false; } + CUDF_EXPECTS(not is_stripe_data_empty or stripe_info->indexLength == 0, + "Invalid index rowgroup stream data"); stripe_data.emplace_back(total_data_size, stream); auto dst_base = static_cast(stripe_data.back().data()); // Coalesce consecutive streams into one read - while (not is_data_empty and stream_count < stream_info.size()) { + while (not is_stripe_data_empty and stream_count < stream_info.size()) { const auto d_dst = dst_base + stream_info[stream_count].dst_pos; const auto offset = stream_info[stream_count].offset; auto len = stream_info[stream_count].length; @@ -1159,7 +1156,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, if (chunk.type_kind == orc::TIMESTAMP) { chunk.timestamp_type_id = _timestamp_type.id(); } - if (not is_data_empty) { + if (not is_stripe_data_empty) { for (int k = 0; k < gpu::CI_NUM_STREAMS; k++) { chunk.streams[k] = dst_base + stream_info[chunk.strm_id[k]].dst_pos; } @@ -1196,7 +1193,8 @@ table_with_metadata reader::impl::read(size_type skip_rows, }); } // Setup row group descriptors if using indexes - if (_metadata.per_file_metadata[0].ps.compression != orc::NONE and not is_data_empty) { + if (_metadata.per_file_metadata[0].ps.compression != orc::NONE and + not is_level_data_empty) { auto decomp_data = decompress_stripe_data(chunks, stripe_data, *_metadata.per_file_metadata[0].decompressor, @@ -1239,7 +1237,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, out_buffers[level].emplace_back(column_types[i], n_rows, is_nullable, stream, _mr); } - if (not is_data_empty) { + if (not is_level_data_empty) { decode_stream_data(chunks, num_dict_entries, skip_rows, @@ -1253,7 +1251,7 @@ table_with_metadata reader::impl::read(size_type skip_rows, // Extract information to process nested child columns if (nested_col.size()) { - if (not is_data_empty) { + if (not is_level_data_empty) { scan_null_counts(chunks, null_count_prefix_sums[level], stream); } row_groups.device_to_host(stream, true); diff --git 
a/cpp/src/io/orc/stats_enc.cu b/cpp/src/io/orc/stats_enc.cu index bbff689082e..1303dd126ef 100644 --- a/cpp/src/io/orc/stats_enc.cu +++ b/cpp/src/io/orc/stats_enc.cu @@ -14,9 +14,9 @@ * limitations under the License. */ -#include "orc_common.hpp" #include "orc_gpu.hpp" +#include #include #include diff --git a/cpp/src/io/orc/stripe_data.cu b/cpp/src/io/orc/stripe_data.cu index c9cc0f04b3c..bf883986c84 100644 --- a/cpp/src/io/orc/stripe_data.cu +++ b/cpp/src/io/orc/stripe_data.cu @@ -14,12 +14,13 @@ * limitations under the License. */ -#include +#include "orc_gpu.hpp" + +#include #include -#include -#include "orc_common.hpp" -#include "orc_gpu.hpp" +#include +#include namespace cudf { namespace io { diff --git a/cpp/src/io/orc/stripe_enc.cu b/cpp/src/io/orc/stripe_enc.cu index ef4bdd421fb..9032e3d2502 100644 --- a/cpp/src/io/orc/stripe_enc.cu +++ b/cpp/src/io/orc/stripe_enc.cu @@ -14,9 +14,9 @@ * limitations under the License. */ -#include "orc_common.hpp" #include "orc_gpu.hpp" +#include #include #include #include @@ -1179,8 +1179,9 @@ __global__ void __launch_bounds__(256) num_blocks = (ss.stream_size > 0) ? (ss.stream_size - 1) / comp_blk_size + 1 : 1; for (uint32_t b = t; b < num_blocks; b += 256) { uint32_t blk_size = min(comp_blk_size, ss.stream_size - min(b * comp_blk_size, ss.stream_size)); - inputs[ss.first_block + b] = {src + b * comp_blk_size, blk_size}; - auto const dst_offset = b * (padded_block_header_size + padded_comp_block_size); + inputs[ss.first_block + b] = {src + b * comp_blk_size, blk_size}; + auto const dst_offset = + padded_block_header_size + b * (padded_block_header_size + padded_comp_block_size); outputs[ss.first_block + b] = {dst + dst_offset, max_comp_blk_size}; results[ss.first_block + b] = {0, compression_status::FAILURE}; } @@ -1234,7 +1235,9 @@ __global__ void __launch_bounds__(1024) ? results[ss.first_block + b].bytes_written : src_len; uint32_t blk_size24{}; - if (results[ss.first_block + b].status == compression_status::SUCCESS) { + // Only use the compressed block if it's smaller than the uncompressed + // If compression failed, dst_len == src_len, so the uncompressed block will be used + if (src_len < dst_len) { // Copy from uncompressed source src = inputs[ss.first_block + b].data(); results[ss.first_block + b].bytes_written = src_len; @@ -1332,11 +1335,11 @@ void CompressOrcDataStreams(uint8_t* compressed_data, if (compression == SNAPPY) { try { - if (nvcomp::is_compression_enabled(nvcomp::compression_type::SNAPPY)) { + if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { + gpu_snap(comp_in, comp_out, comp_res, stream); + } else { nvcomp::batched_compress( nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); - } else { - gpu_snap(comp_in, comp_out, comp_res, stream); } } catch (...) 
{ // There was an error in compressing so set an error status for each block @@ -1348,12 +1351,18 @@ void CompressOrcDataStreams(uint8_t* compressed_data, // Since SNAPPY is the default compression (may not be explicitly requested), fall back to // writing without compression } - } else if (compression == ZLIB and - nvcomp::is_compression_enabled(nvcomp::compression_type::DEFLATE)) { + } else if (compression == ZLIB) { + if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::DEFLATE); + reason) { + CUDF_FAIL("Compression error: " + reason.value()); + } nvcomp::batched_compress( nvcomp::compression_type::DEFLATE, comp_in, comp_out, comp_res, stream); - } else if (compression == ZSTD and - nvcomp::is_compression_enabled(nvcomp::compression_type::ZSTD)) { + } else if (compression == ZSTD) { + if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); + reason) { + CUDF_FAIL("Compression error: " + reason.value()); + } nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); } else if (compression != NONE) { CUDF_FAIL("Unsupported compression type"); diff --git a/cpp/src/io/orc/stripe_init.cu b/cpp/src/io/orc/stripe_init.cu index bd65089810e..381a734021c 100644 --- a/cpp/src/io/orc/stripe_init.cu +++ b/cpp/src/io/orc/stripe_init.cu @@ -14,9 +14,9 @@ * limitations under the License. */ -#include "orc_common.hpp" #include "orc_gpu.hpp" +#include #include #include diff --git a/cpp/src/io/orc/timezone.cuh b/cpp/src/io/orc/timezone.cuh index 2eb20af7898..9b98aa13bac 100644 --- a/cpp/src/io/orc/timezone.cuh +++ b/cpp/src/io/orc/timezone.cuh @@ -115,7 +115,7 @@ class timezone_table { public: // Safe to use the default stream, device_uvectors will not change after they are created empty - timezone_table() : ttimes{0, cudf::default_stream_value}, offsets{0, cudf::default_stream_value} + timezone_table() : ttimes{0, cudf::get_default_stream()}, offsets{0, cudf::get_default_stream()} { } timezone_table(int32_t gmt_offset, diff --git a/cpp/src/io/orc/writer_impl.cu b/cpp/src/io/orc/writer_impl.cu index a5e9e9da4cb..c0ae58a64d9 100644 --- a/cpp/src/io/orc/writer_impl.cu +++ b/cpp/src/io/orc/writer_impl.cu @@ -118,9 +118,9 @@ constexpr size_t compression_block_size(orc::CompressionKind compression) if (compression == orc::CompressionKind::NONE) { return 0; } auto const ncomp_type = to_nvcomp_compression_type(compression); - auto const nvcomp_limit = nvcomp::is_compression_enabled(ncomp_type) - ? nvcomp::compress_max_allowed_chunk_size(ncomp_type) - : std::nullopt; + auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) + ? 
std::nullopt + : nvcomp::compress_max_allowed_chunk_size(ncomp_type); constexpr size_t max_block_size = 256 * 1024; return std::min(nvcomp_limit.value_or(max_block_size), max_block_size); @@ -537,7 +537,7 @@ constexpr size_t RLE_stream_size(TypeKind kind, size_t count) auto uncomp_block_alignment(CompressionKind compression_kind) { if (compression_kind == NONE or - not nvcomp::is_compression_enabled(to_nvcomp_compression_type(compression_kind))) { + nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) { return 1u; } @@ -547,7 +547,7 @@ auto uncomp_block_alignment(CompressionKind compression_kind) auto comp_block_alignment(CompressionKind compression_kind) { if (compression_kind == NONE or - not nvcomp::is_compression_enabled(to_nvcomp_compression_type(compression_kind))) { + nvcomp::is_compression_disabled(to_nvcomp_compression_type(compression_kind))) { return 1u; } @@ -2161,7 +2161,8 @@ void writer::impl::write(table_view const& table) auto dec_chunk_sizes = decimal_chunk_sizes(orc_table, segmentation, stream); - auto const uncomp_block_align = uncomp_block_alignment(compression_kind_); + auto const uncompressed_block_align = uncomp_block_alignment(compression_kind_); + auto const compressed_block_align = comp_block_alignment(compression_kind_); auto streams = create_streams(orc_table.columns, segmentation, decimal_column_sizes(dec_chunk_sizes.rg_sizes)); auto enc_data = encode_columns(orc_table, @@ -2169,7 +2170,7 @@ void writer::impl::write(table_view const& table) std::move(dec_chunk_sizes), segmentation, streams, - uncomp_block_align, + uncompressed_block_align, stream); // Assemble individual disparate column chunks into contiguous data streams @@ -2187,9 +2188,9 @@ void writer::impl::write(table_view const& table) auto const max_compressed_block_size = max_compression_output_size(compression_kind_, compression_blocksize_); auto const padded_max_compressed_block_size = - util::round_up_unsafe(max_compressed_block_size, uncomp_block_align); + util::round_up_unsafe(max_compressed_block_size, compressed_block_align); auto const padded_block_header_size = - util::round_up_unsafe(block_header_size, uncomp_block_align); + util::round_up_unsafe(block_header_size, compressed_block_align); auto stream_output = [&]() { size_t max_stream_size = 0; @@ -2238,7 +2239,7 @@ void writer::impl::write(table_view const& table) compression_kind_, compression_blocksize_, max_compressed_block_size, - comp_block_alignment(compression_kind_), + compressed_block_align, strm_descs, enc_data.streams, comp_results, diff --git a/cpp/src/io/parquet/chunk_dict.cu b/cpp/src/io/parquet/chunk_dict.cu index 671e34ac73d..999cad76d5d 100644 --- a/cpp/src/io/parquet/chunk_dict.cu +++ b/cpp/src/io/parquet/chunk_dict.cu @@ -14,7 +14,7 @@ * limitations under the License. 
*/ -#include +#include "parquet_gpu.cuh" #include #include diff --git a/cpp/src/io/parquet/compact_protocol_writer.cpp b/cpp/src/io/parquet/compact_protocol_writer.cpp index 28baad9c51c..f5ae262fa3f 100644 --- a/cpp/src/io/parquet/compact_protocol_writer.cpp +++ b/cpp/src/io/parquet/compact_protocol_writer.cpp @@ -150,7 +150,7 @@ size_t CompactProtocolWriter::write(const SchemaElement& s) // if (isset.STRING or isset.MAP or isset.LIST or isset.ENUM or isset.DECIMAL or isset.DATE or // isset.TIME or isset.TIMESTAMP or isset.INTEGER or isset.UNKNOWN or isset.JSON or isset.BSON) // { - if (isset.TIMESTAMP) { c.field_struct(10, s.logical_type); } + if (isset.TIMESTAMP or isset.TIME) { c.field_struct(10, s.logical_type); } return c.value(); } diff --git a/cpp/src/io/parquet/page_data.cu b/cpp/src/io/parquet/page_data.cu index 531733a7df7..c580aa5bbc0 100644 --- a/cpp/src/io/parquet/page_data.cu +++ b/cpp/src/io/parquet/page_data.cu @@ -18,8 +18,10 @@ #include #include +#include #include #include +#include #include #include @@ -52,6 +54,8 @@ namespace io { namespace parquet { namespace gpu { +namespace { + struct page_state_s { const uint8_t* data_start; const uint8_t* data_end; @@ -146,11 +150,18 @@ __device__ uint32_t InitLevelSection(page_state_s* s, s->initial_rle_value[lvl] = 0; s->lvl_start[lvl] = cur; } else if (encoding == Encoding::RLE) { - if (cur + 4 < end) { - uint32_t run; + // V2 only uses RLE encoding, so only perform check here + if (s->page.def_lvl_bytes || s->page.rep_lvl_bytes) { + len = lvl == level_type::DEFINITION ? s->page.def_lvl_bytes : s->page.rep_lvl_bytes; + } else if (cur + 4 < end) { len = 4 + (cur[0]) + (cur[1] << 8) + (cur[2] << 16) + (cur[3] << 24); cur += 4; - run = get_vlq32(cur, end); + } else { + len = 0; + s->error = 2; + } + if (!s->error) { + uint32_t run = get_vlq32(cur, end); s->initial_rle_run[lvl] = run; if (!(run & 1)) { int v = (cur < end) ? cur[0] : 0; @@ -163,9 +174,6 @@ __device__ uint32_t InitLevelSection(page_state_s* s, } s->lvl_start[lvl] = cur; if (cur > end) { s->error = 2; } - } else { - len = 0; - s->error = 2; } } else if (encoding == Encoding::BIT_PACKED) { len = (s->page.num_input_values * level_bits + 7) >> 3; @@ -176,7 +184,7 @@ __device__ uint32_t InitLevelSection(page_state_s* s, s->error = 3; len = 0; } - return (uint32_t)len; + return static_cast(len); } /** @@ -277,13 +285,18 @@ __device__ void gpuDecodeStream( * 31) * @param[in] t Warp1 thread ID (0..31) * - * @return The new output position + * @return A pair containing the new output position, and the total length of strings decoded (this + * will only be valid on thread 0 and if sizes_only is true) */ -__device__ int gpuDecodeDictionaryIndices(volatile page_state_s* s, int target_pos, int t) +template +__device__ cuda::std::pair gpuDecodeDictionaryIndices(volatile page_state_s* s, + int target_pos, + int t) { const uint8_t* end = s->data_end; int dict_bits = s->dict_bits; int pos = s->dict_pos; + int str_len = 0; while (pos < target_pos) { int is_literal, batch_len; @@ -328,8 +341,11 @@ __device__ int gpuDecodeDictionaryIndices(volatile page_state_s* s, int target_p __syncwarp(); is_literal = shuffle(is_literal); batch_len = shuffle(batch_len); + + // compute dictionary index. 
+ int dict_idx = 0; if (t < batch_len) { - int dict_idx = s->dict_val; + dict_idx = s->dict_val; if (is_literal) { int32_t ofs = (t - ((batch_len + 7) & ~7)) * dict_bits; const uint8_t* p = s->data_start + (ofs >> 3); @@ -349,11 +365,36 @@ __device__ int gpuDecodeDictionaryIndices(volatile page_state_s* s, int target_p dict_idx &= (1 << dict_bits) - 1; } } - s->dict_idx[(pos + t) & (non_zero_buffer_size - 1)] = dict_idx; + + // if we're not computing sizes, store off the dictionary index + if constexpr (!sizes_only) { s->dict_idx[(pos + t) & (non_zero_buffer_size - 1)] = dict_idx; } + } + + // if we're computing sizes, add the length(s) + if constexpr (sizes_only) { + int const len = [&]() { + if (t >= batch_len) { return 0; } + // we may end up decoding more indices than we asked for. so don't include those in the + // size calculation + if (pos + t >= target_pos) { return 0; } + // TODO: refactor this with gpuGetStringData / gpuGetStringSize + uint32_t const dict_pos = (s->dict_bits > 0) ? dict_idx * sizeof(string_index_pair) : 0; + if (target_pos && dict_pos < (uint32_t)s->dict_size) { + const auto* src = reinterpret_cast(s->dict_base + dict_pos); + return src->second; + } + return 0; + }(); + + using WarpReduce = cub::WarpReduce; + __shared__ typename WarpReduce::TempStorage temp_storage; + // note: str_len will only be valid on thread 0. + str_len += WarpReduce(temp_storage).Sum(len); } + pos += batch_len; } - return pos; + return {pos, str_len}; } /** @@ -420,17 +461,20 @@ __device__ int gpuDecodeRleBooleans(volatile page_state_s* s, int target_pos, in } /** - * @brief Parses the length and position of strings + * @brief Parses the length and position of strings and returns total length of all strings + * processed * * @param[in,out] s Page state input/output * @param[in] target_pos Target output position * @param[in] t Thread ID * - * @return The new output position + * @return Total length of strings processed */ -__device__ void gpuInitStringDescriptors(volatile page_state_s* s, int target_pos, int t) +__device__ size_type gpuInitStringDescriptors(volatile page_state_s* s, int target_pos, int t) { - int pos = s->dict_pos; + int pos = s->dict_pos; + int total_len = 0; + // This step is purely serial if (!t) { const uint8_t* cur = s->data_start; @@ -449,21 +493,26 @@ __device__ void gpuInitStringDescriptors(volatile page_state_s* s, int target_po s->dict_idx[pos & (non_zero_buffer_size - 1)] = k; s->str_len[pos & (non_zero_buffer_size - 1)] = len; k += len; + total_len += len; pos++; } s->dict_val = k; __threadfence_block(); } + + return total_len; } /** - * @brief Output a string descriptor + * @brief Retrieves string information for a string at the specified source position * - * @param[in,out] s Page state input/output + * @param[in] s Page state input * @param[in] src_pos Source position - * @param[in] dstv Pointer to row output data (string descriptor or 32-bit hash) + * + * @return A pair containing a pointer to the string and its length */ -inline __device__ void gpuOutputString(volatile page_state_s* s, int src_pos, void* dstv) +inline __device__ cuda::std::pair gpuGetStringData(volatile page_state_s* s, + int src_pos) { const char* ptr = nullptr; size_t len = 0; @@ -486,6 +535,20 @@ inline __device__ void gpuOutputString(volatile page_state_s* s, int src_pos, vo len = s->str_len[src_pos & (non_zero_buffer_size - 1)]; } } + + return {ptr, len}; +} + +/** + * @brief Output a string descriptor + * + * @param[in,out] s Page state input/output + * @param[in] src_pos Source 
position + * @param[in] dstv Pointer to row output data (string descriptor or 32-bit hash) + */ +inline __device__ void gpuOutputString(volatile page_state_s* s, int src_pos, void* dstv) +{ + auto [ptr, len] = gpuGetStringData(s, src_pos); if (s->dtype_len == 4) { // Output hash. This hash value is used if the option to convert strings to // categoricals is enabled. The seed value is chosen arbitrarily. @@ -814,14 +877,17 @@ static __device__ void gpuOutputGeneric(volatile page_state_s* s, * @param[in, out] s The local page state to be filled in * @param[in] p The global page to be copied from * @param[in] chunks The global list of chunks - * @param[in] num_rows Maximum number of rows to read * @param[in] min_row Crop all rows below min_row + * @param[in] num_rows Maximum number of rows to read + * @param[in] is_decode_step If we are setting up for the decode step (instead of the preprocess + * step) */ static __device__ bool setupLocalPageInfo(page_state_s* const s, PageInfo const* p, device_span chunks, size_t min_row, - size_t num_rows) + size_t num_rows, + bool is_decode_step) { int t = threadIdx.x; int chunk_idx; @@ -872,15 +938,15 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, case BOOLEAN: s->dtype_len = 1; // Boolean are stored as 1 byte on the output break; - case INT32: + case INT32: [[fallthrough]]; case FLOAT: s->dtype_len = 4; break; case INT64: if (s->col.ts_clock_rate) { int32_t units = 0; - if (s->col.converted_type == TIME_MILLIS or s->col.converted_type == TIMESTAMP_MILLIS) { + // Duration types are not included because no scaling is done when reading + if (s->col.converted_type == TIMESTAMP_MILLIS) { units = cudf::timestamp_ms::period::den; - } else if (s->col.converted_type == TIME_MICROS or - s->col.converted_type == TIMESTAMP_MICROS) { + } else if (s->col.converted_type == TIMESTAMP_MICROS) { units = cudf::timestamp_us::period::den; } else if (s->col.logical_type.TIMESTAMP.unit.isset.NANOS) { units = cudf::timestamp_ns::period::den; @@ -890,7 +956,7 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, : (s->col.ts_clock_rate / units); } } - // Fall through to DOUBLE + [[fallthrough]]; case DOUBLE: s->dtype_len = 8; break; case INT96: s->dtype_len = 12; break; case BYTE_ARRAY: s->dtype_len = sizeof(string_index_pair); break; @@ -906,25 +972,41 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, : s->dtype_len <= sizeof(int64_t) ? 
sizeof(int64_t) : sizeof(__int128_t); } else if (data_type == INT32) { - if (dtype_len_out == 1) s->dtype_len = 1; // INT8 output - if (dtype_len_out == 2) s->dtype_len = 2; // INT16 output + if (dtype_len_out == 1) { + // INT8 output + s->dtype_len = 1; + } else if (dtype_len_out == 2) { + // INT16 output + s->dtype_len = 2; + } else if (s->col.converted_type == TIME_MILLIS) { + // INT64 output + s->dtype_len = 8; + } } else if (data_type == BYTE_ARRAY && dtype_len_out == 4) { s->dtype_len = 4; // HASH32 output } else if (data_type == INT96) { s->dtype_len = 8; // Convert to 64-bit timestamp } - // first row within the page to output - if (page_start_row >= min_row) { - s->first_row = 0; - } else { - s->first_row = (int32_t)min(min_row - page_start_row, (size_t)s->page.num_rows); - } - // # of rows within the page to output - s->num_rows = s->page.num_rows; - if ((page_start_row + s->first_row) + s->num_rows > min_row + num_rows) { - s->num_rows = - (int32_t)max((int64_t)(min_row + num_rows - (page_start_row + s->first_row)), INT64_C(0)); + // NOTE: s->page.num_rows, s->col.chunk_row, s->first_row and s->num_rows will be + // invalid/bogus during first pass of the preprocess step for nested types. this is ok + // because we ignore these values in that stage. + { + auto const max_row = min_row + num_rows; + + // if we are totally outside the range of the input, do nothing + if ((page_start_row > max_row) || (page_start_row + s->page.num_rows < min_row)) { + s->first_row = 0; + s->num_rows = 0; + } + // otherwise + else { + s->first_row = page_start_row >= min_row ? 0 : min_row - page_start_row; + auto const max_page_rows = s->page.num_rows - s->first_row; + s->num_rows = (page_start_row + s->first_row) + max_page_rows <= max_row + ? max_page_rows + : max_row - (page_start_row + s->first_row); + } } // during the decoding step we need to offset the global output buffers @@ -932,7 +1014,11 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, // is responsible for. // - for flat schemas, we can do this directly by using row counts // - for nested schemas, these offsets are computed during the preprocess step - if (s->col.column_data_base != nullptr) { + // + // NOTE: in a chunked read situation, s->col.column_data_base and s->col.valid_map_base + // will be aliased to memory that has been freed when we get here in the non-decode step, so + // we cannot check against nullptr. we'll just check a flag directly. + if (is_decode_step) { int max_depth = s->col.max_nesting_depth; for (int idx = 0; idx < max_depth; idx++) { PageNestingInfo* pni = &s->page.nesting[idx]; @@ -942,12 +1028,13 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, if (s->col.max_level[level_type::REPETITION] == 0) { output_offset = page_start_row >= min_row ? page_start_row - min_row : 0; } - // for schemas with lists, we've already got the exactly value precomputed + // for schemas with lists, we've already got the exact value precomputed else { output_offset = pni->page_start_value; } pni->data_out = static_cast(s->col.column_data_base[idx]); + if (pni->data_out != nullptr) { // anything below max depth with a valid data pointer must be a list, so the // element size is the size of the offset type. 
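The rewritten first_row/num_rows block above is essentially an interval intersection between a page's row range [page_start_row, page_start_row + page.num_rows) and the requested read window [min_row, min_row + num_rows). A minimal host-side sketch of that clamping follows; the helper name clamp_page and the page_window struct are hypothetical illustrations, not part of cudf:

#include <cstddef>
#include <cstdint>

struct page_window {
  int32_t first_row;  // first row within the page to output
  int32_t num_rows;   // number of rows within the page to output
};

// Clamp a page covering [page_start_row, page_start_row + page_num_rows)
// to the requested window [min_row, min_row + num_rows).
page_window clamp_page(std::size_t page_start_row,
                       std::size_t page_num_rows,
                       std::size_t min_row,
                       std::size_t num_rows)
{
  std::size_t const max_row = min_row + num_rows;
  // the page lies entirely outside the window: nothing to read
  if (page_start_row > max_row || page_start_row + page_num_rows < min_row) { return {0, 0}; }
  // otherwise, skip rows before min_row and cap rows that run past max_row
  auto const first =
    static_cast<int32_t>(page_start_row >= min_row ? 0 : min_row - page_start_row);
  std::size_t const max_page_rows = page_num_rows - first;
  auto const rows = (page_start_row + first) + max_page_rows <= max_row
                      ? static_cast<int32_t>(max_page_rows)
                      : static_cast<int32_t>(max_row - (page_start_row + first));
  return {first, rows};
}

As in the diff, a page straddling the window start contributes only its tail rows, and one straddling the window end is truncated at max_row.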
@@ -1024,6 +1111,7 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, s->page.skipped_leaf_values = 0; s->input_value_count = 0; s->input_row_count = 0; + s->input_leaf_count = 0; s->row_index_lower_bound = -1; } @@ -1052,13 +1140,14 @@ static __device__ bool setupLocalPageInfo(page_state_s* const s, // if we're in the decoding step, jump directly to the first // value we care about - if (s->col.column_data_base != nullptr) { + if (is_decode_step) { s->input_value_count = s->page.skipped_values > -1 ? s->page.skipped_values : 0; } else { - s->input_value_count = 0; - s->input_leaf_count = 0; - s->page.skipped_values = -1; - s->page.skipped_leaf_values = -1; + s->input_value_count = 0; + s->input_leaf_count = 0; + s->page.skipped_values = + -1; // magic number to indicate it hasn't been set for use inside UpdatePageSizes + s->page.skipped_leaf_values = 0; } } @@ -1175,7 +1264,8 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu int t) { // max nesting depth of the column - int const max_depth = s->col.max_nesting_depth; + int const max_depth = s->col.max_nesting_depth; + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; // how many (input) values we've processed in the page so far int input_value_count = s->input_value_count; // how many rows we've processed in the page so far @@ -1235,7 +1325,7 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu uint32_t const warp_valid_mask = // for flat schemas, a simple ballot_sync gives us the correct count and bit positions // because every value in the input matches to a value in the output - max_depth == 1 + !has_repetition ? ballot(is_valid) : // for nested schemas, it's more complicated. This warp will visit 32 incoming values, @@ -1284,11 +1374,12 @@ static __device__ void gpuUpdateValidityOffsetsAndRowIndices(int32_t target_inpu // the correct position to start reading. since we are about to write the validity vector here // we need to adjust our computed mask to take into account the write row bounds. int const in_write_row_bounds = - max_depth == 1 + !has_repetition ? thread_row_index >= s->first_row && thread_row_index < (s->first_row + s->num_rows) : in_row_bounds; int const first_thread_in_write_range = - max_depth == 1 ? __ffs(ballot(in_write_row_bounds)) - 1 : 0; + !has_repetition ? __ffs(ballot(in_write_row_bounds)) - 1 : 0; + // # of bits of the validity mask to write out int const warp_valid_mask_bit_count = first_thread_in_write_range < 0 ? 0 : warp_value_count - first_thread_in_write_range; @@ -1383,8 +1474,7 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, bool bounds_set) { // max nesting depth of the column - int max_depth = s->col.max_nesting_depth; - // bool has_repetition = s->col.max_level[level_type::REPETITION] > 0 ? true : false; + int const max_depth = s->col.max_nesting_depth; // how many input level values we've processed in the page so far int input_value_count = s->input_value_count; // how many leaf values we've processed in the page so far @@ -1398,11 +1488,10 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, start_depth, end_depth, d, s, input_value_count, target_input_value_count, t); // count rows and leaf values - int is_new_row = start_depth == 0 ? 1 : 0; - uint32_t warp_row_count_mask = ballot(is_new_row); - int is_new_leaf = (d >= s->page.nesting[max_depth - 1].max_def_level) ? 
1 : 0; - uint32_t warp_leaf_count_mask = ballot(is_new_leaf); - + int const is_new_row = start_depth == 0 ? 1 : 0; + uint32_t const warp_row_count_mask = ballot(is_new_row); + int const is_new_leaf = (d >= s->page.nesting[max_depth - 1].max_def_level) ? 1 : 0; + uint32_t const warp_leaf_count_mask = ballot(is_new_leaf); // is this thread within row bounds? on the first pass we don't know the bounds, so we will be // computing the full size of the column. on the second pass, we will know our actual row // bounds, so the computation will cap sizes properly. @@ -1416,8 +1505,8 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, ? 1 : 0; - uint32_t row_bounds_mask = ballot(in_row_bounds); - int first_thread_in_range = __ffs(row_bounds_mask) - 1; + uint32_t const row_bounds_mask = ballot(in_row_bounds); + int const first_thread_in_range = __ffs(row_bounds_mask) - 1; // if we've found the beginning of the first row, mark down the position // in the def/repetition buffer (skipped_values) and the data buffer (skipped_leaf_values) @@ -1430,13 +1519,15 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, } } - // increment counts across all nesting depths + // increment value counts across all nesting depths for (int s_idx = 0; s_idx < max_depth; s_idx++) { - // if we are within the range of nesting levels we should be adding value indices for - int in_nesting_bounds = (s_idx >= start_depth && s_idx <= end_depth && in_row_bounds) ? 1 : 0; + PageNestingInfo* pni = &s->page.nesting[s_idx]; - uint32_t count_mask = ballot(in_nesting_bounds); - if (!t) { s->page.nesting[s_idx].size += __popc(count_mask); } + // if we are within the range of nesting levels we should be adding value indices for + int const in_nesting_bounds = + (s_idx >= start_depth && s_idx <= end_depth && in_row_bounds) ? 1 : 0; + uint32_t const count_mask = ballot(in_nesting_bounds); + if (!t) { pni->batch_size += __popc(count_mask); } } input_value_count += min(32, (target_input_value_count - input_value_count)); @@ -1452,6 +1543,21 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, } } +__device__ size_type gpuGetStringSize(page_state_s* s, int target_count, int t) +{ + auto dict_target_pos = target_count; + size_type str_len = 0; + if (s->dict_base) { + auto const [new_target_pos, len] = gpuDecodeDictionaryIndices(s, target_count, t); + dict_target_pos = new_target_pos; + str_len = len; + } else if ((s->col.data_type & 7) == BYTE_ARRAY) { + str_len = gpuInitStringDescriptors(s, target_count, t); + } + if (!t) { *(volatile int32_t*)&s->dict_pos = dict_target_pos; } + return str_len; +} + /** * @brief Kernel for computing per-page column size information for all nesting levels. * @@ -1460,17 +1566,20 @@ static __device__ void gpuUpdatePageSizes(page_state_s* s, * @param pages List of pages * @param chunks List of column chunks * @param min_row Row index to start reading at - * @param num_rows Maximum number of rows to read. Pass as INT_MAX to guarantee reading all rows. - * @param trim_pass Whether or not this is the trim pass. We first have to compute + * @param num_rows Maximum number of rows to read. Pass as INT_MAX to guarantee reading all rows + * @param is_base_pass Whether or not this is the base pass. We first have to compute * the full size information of every page before we come through in a second (trim) pass - * to determine what subset of rows in this page we should be reading. 
+ * to determine what subset of rows in this page we should be reading + * @param compute_string_sizes Whether or not we should be computing string sizes + * (PageInfo::str_bytes) as part of the pass */ __global__ void __launch_bounds__(block_size) gpuComputePageSizes(PageInfo* pages, device_span chunks, size_t min_row, size_t num_rows, - bool trim_pass) + bool is_base_pass, + bool compute_string_sizes) { __shared__ __align__(16) page_state_s state_g; @@ -1479,32 +1588,82 @@ __global__ void __launch_bounds__(block_size) int t = threadIdx.x; PageInfo* pp = &pages[page_idx]; - if (!setupLocalPageInfo(s, pp, chunks, trim_pass ? min_row : 0, trim_pass ? num_rows : INT_MAX)) { - return; - } + if (!setupLocalPageInfo(s, pp, chunks, min_row, num_rows, false)) { return; } - // zero sizes - int d = 0; - while (d < s->page.num_nesting_levels) { - if (d + t < s->page.num_nesting_levels) { s->page.nesting[d + t].size = 0; } - d += blockDim.x; - } if (!t) { s->page.skipped_values = -1; - s->page.skipped_leaf_values = -1; + s->page.skipped_leaf_values = 0; + s->page.str_bytes = 0; s->input_row_count = 0; s->input_value_count = 0; - // if this isn't the trim pass, make sure we visit absolutely everything - if (!trim_pass) { + // in the base pass, we're computing the number of rows, make sure we visit absolutely + // everything + if (is_base_pass) { s->first_row = 0; s->num_rows = INT_MAX; s->row_index_lower_bound = -1; } } - __syncthreads(); - bool has_repetition = s->col.max_level[level_type::REPETITION] > 0; + // we only need to preprocess hierarchies with repetition in them (ie, hierarchies + // containing lists anywhere within). + bool const has_repetition = chunks[pp->chunk_idx].max_level[level_type::REPETITION] > 0; + compute_string_sizes = + compute_string_sizes && ((s->col.data_type & 7) == BYTE_ARRAY && s->dtype_len != 4); + + // various early out optimizations: + + // - if this is a flat hierarchy (no lists) and is not a string column. in this case we don't need + // to do + // the expensive work of traversing the level data to determine sizes. we can just compute it + // directly. + if (!has_repetition && !compute_string_sizes) { + int d = 0; + while (d < s->page.num_nesting_levels) { + auto const i = d + t; + if (i < s->page.num_nesting_levels) { + if (is_base_pass) { pp->nesting[i].size = pp->num_input_values; } + pp->nesting[i].batch_size = pp->num_input_values; + } + d += blockDim.x; + } + return; + } + + // - if this page is not at the beginning or end of the trim bounds, the batch size is + // the full page size + if (!is_base_pass && s->num_rows == s->page.num_rows) { + int d = 0; + while (d < s->page.num_nesting_levels) { + auto const i = d + t; + if (i < s->page.num_nesting_levels) { pp->nesting[i].batch_size = pp->nesting[i].size; } + d += blockDim.x; + } + return; + } + + // - if this page is completely trimmed, zero out sizes. + if (!is_base_pass && s->num_rows == 0) { + int d = 0; + while (d < s->page.num_nesting_levels) { + auto const i = d + t; + if (i < s->page.num_nesting_levels) { pp->nesting[i].batch_size = 0; } + d += blockDim.x; + } + return; + } + + // at this point we are going to be fully recomputing batch information + + // zero sizes + int d = 0; + while (d < s->page.num_nesting_levels) { + if (d + t < s->page.num_nesting_levels) { s->page.nesting[d + t].batch_size = 0; } + d += blockDim.x; + } + + __syncthreads(); // optimization : it might be useful to have a version of gpuDecodeStream that could go wider than // 1 warp. 
Currently it only uses 1 warp so that it can overlap work with the value decoding step @@ -1528,16 +1687,39 @@ __global__ void __launch_bounds__(block_size) : s->lvl_count[level_type::DEFINITION]; // process what we got back - gpuUpdatePageSizes(s, actual_input_count, t, trim_pass); + gpuUpdatePageSizes(s, actual_input_count, t, !is_base_pass); + if (compute_string_sizes) { + auto const str_len = gpuGetStringSize(s, s->input_leaf_count, t); + if (!t) { s->page.str_bytes += str_len; } + } + target_input_count = actual_input_count + batch_size; __syncwarp(); } } - // update # rows in the actual page + + // update output results: + // - real number of rows for the whole page + // - nesting sizes for the whole page + // - skipped value information for trimmed pages + // - string bytes + if (is_base_pass) { + // nesting level 0 is the root column, so the size is also the # of rows + if (!t) { pp->num_rows = s->page.nesting[0].batch_size; } + + // store off this batch size as the "full" size + int d = 0; + while (d < s->page.num_nesting_levels) { + auto const i = d + t; + if (i < s->page.num_nesting_levels) { pp->nesting[i].size = pp->nesting[i].batch_size; } + d += blockDim.x; + } + } + if (!t) { - pp->num_rows = s->page.nesting[0].size; pp->skipped_values = s->page.skipped_values; pp->skipped_leaf_values = s->page.skipped_leaf_values; + pp->str_bytes = s->page.str_bytes; } } @@ -1564,7 +1746,10 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( int t = threadIdx.x; int out_thread0; - if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows)) { return; } + if (!setupLocalPageInfo(s, &pages[page_idx], chunks, min_row, num_rows, true)) { return; } + + // if we have no rows to do (eg, in a skip_rows/num_rows case) + if (s->num_rows == 0) { return; } if (s->dict_base) { out_thread0 = (s->dict_bits > 0) ? 64 : 32; @@ -1573,6 +1758,8 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( ((s->col.data_type & 7) == BOOLEAN || (s->col.data_type & 7) == BYTE_ARRAY) ? 64 : 32; } + bool const has_repetition = s->col.max_level[level_type::REPETITION] > 0; + // skipped_leaf_values will always be 0 for flat hierarchies. uint32_t skipped_leaf_values = s->page.skipped_leaf_values; while (!s->error && (s->input_value_count < s->num_input_values || s->src_pos < s->nz_count)) { @@ -1599,7 +1786,7 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( // WARP1: Decode dictionary indices, booleans or string positions if (s->dict_base) { - src_target_pos = gpuDecodeDictionaryIndices(s, src_target_pos, t & 0x1f); + src_target_pos = gpuDecodeDictionaryIndices(s, src_target_pos, t & 0x1f).first; } else if ((s->col.data_type & 7) == BOOLEAN) { src_target_pos = gpuDecodeRleBooleans(s, src_target_pos, t & 0x1f); } else if ((s->col.data_type & 7) == BYTE_ARRAY) { @@ -1625,7 +1812,7 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( // - so we will end up ignoring the first two input rows, and input rows 2..n will // get written to the output starting at position 0. // - if (s->col.max_nesting_depth == 1) { dst_pos -= s->first_row; } + if (!has_repetition) { dst_pos -= s->first_row; } // target_pos will always be properly bounded by num_rows, but dst_pos may be negative (values // before first_row) in the flat hierarchy case. 
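When gpuComputePageSizes runs with compute_string_sizes set, the decode loop above accumulates each batch's string bytes via a warp reduction, and only lane 0 holds the valid total (hence the "str_len will only be valid on thread 0" note). A self-contained CUDA sketch of that cub::WarpReduce pattern; the kernel and its names are illustrative stand-ins, not the cudf code:

#include <cub/cub.cuh>

__global__ void warp_sum_lengths(int const* lens, int n, int* total_out)
{
  using WarpReduce = cub::WarpReduce<int>;
  __shared__ typename WarpReduce::TempStorage temp_storage;

  int const lane = threadIdx.x;               // launched with a single 32-thread warp
  int const len  = lane < n ? lens[lane] : 0;  // inactive lanes contribute zero

  // warp-wide sum; the return value is only defined on lane 0
  int const total = WarpReduce(temp_storage).Sum(len);
  if (lane == 0) { *total_out = total; }
}

Launched as warp_sum_lengths<<<1, 32>>>(d_lens, n, d_total), this mirrors how the loop above adds each reduced batch length into s->page.str_bytes on thread 0 only.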
@@ -1663,7 +1850,12 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( } else if (dtype == INT96) { gpuOutputInt96Timestamp(s, val_src_pos, static_cast(dst)); } else if (dtype_len == 8) { - if (s->ts_scale) { + if (s->dtype_len_in == 4) { + // Reading INT32 TIME_MILLIS into 64-bit DURATION_MILLISECONDS + // TIME_MILLIS is the only duration type stored as int32: + // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md#deprecated-time-convertedtype + gpuOutputFast(s, val_src_pos, static_cast(dst)); + } else if (s->ts_scale) { gpuOutputInt64Timestamp(s, val_src_pos, static_cast(dst)); } else { gpuOutputFast(s, val_src_pos, static_cast(dst)); @@ -1681,71 +1873,18 @@ __global__ void __launch_bounds__(block_size) gpuDecodePageData( } } -struct chunk_row_output_iter { - PageInfo* p; - using value_type = size_type; - using difference_type = size_type; - using pointer = size_type*; - using reference = size_type&; - using iterator_category = thrust::output_device_iterator_tag; - - __host__ __device__ chunk_row_output_iter operator+(int i) - { - return chunk_row_output_iter{p + i}; - } - - __host__ __device__ void operator++() { p++; } - - __device__ reference operator[](int i) { return p[i].chunk_row; } - __device__ reference operator*() { return p->chunk_row; } - __device__ void operator=(value_type v) { p->chunk_row = v; } -}; - -struct start_offset_output_iterator { - PageInfo* pages; - int* page_indices; - int cur_index; - int src_col_schema; - int nesting_depth; - int empty = 0; - using value_type = size_type; - using difference_type = size_type; - using pointer = size_type*; - using reference = size_type&; - using iterator_category = thrust::output_device_iterator_tag; - - __host__ __device__ start_offset_output_iterator operator+(int i) - { - return start_offset_output_iterator{ - pages, page_indices, cur_index + i, src_col_schema, nesting_depth}; - } - - __host__ __device__ void operator++() { cur_index++; } - - __device__ reference operator[](int i) { return dereference(cur_index + i); } - __device__ reference operator*() { return dereference(cur_index); } - - private: - __device__ reference dereference(int index) - { - PageInfo const& p = pages[page_indices[index]]; - if (p.src_col_schema != src_col_schema || p.flags & PAGEINFO_FLAGS_DICTIONARY) { return empty; } - return p.nesting[nesting_depth].page_start_value; - } -}; +} // anonymous namespace /** - * @copydoc cudf::io::parquet::gpu::PreprocessColumnData + * @copydoc cudf::io::parquet::gpu::ComputePageSizes */ -void PreprocessColumnData(hostdevice_vector& pages, - hostdevice_vector const& chunks, - std::vector& input_columns, - std::vector& output_columns, - size_t num_rows, - size_t min_row, - bool uses_custom_row_bounds, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) +void ComputePageSizes(hostdevice_vector& pages, + hostdevice_vector const& chunks, + size_t min_row, + size_t num_rows, + bool compute_num_rows, + bool compute_string_sizes, + rmm::cuda_stream_view stream) { dim3 dim_block(block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page @@ -1756,132 +1895,7 @@ void PreprocessColumnData(hostdevice_vector& pages, // If uses_custom_row_bounds is set to true, we have to do a second pass later that "trims" // the starting and ending read values to account for these bounds. gpuComputePageSizes<<>>( - pages.device_ptr(), - chunks, - // if uses_custom_row_bounds is false, include all possible rows. - uses_custom_row_bounds ? 
min_row : 0, - uses_custom_row_bounds ? num_rows : INT_MAX, - !uses_custom_row_bounds); - - // computes: - // PageInfo::chunk_row for all pages - auto key_input = thrust::make_transform_iterator( - pages.device_ptr(), [] __device__(PageInfo const& page) { return page.chunk_idx; }); - auto page_input = thrust::make_transform_iterator( - pages.device_ptr(), [] __device__(PageInfo const& page) { return page.num_rows; }); - thrust::exclusive_scan_by_key(rmm::exec_policy(stream), - key_input, - key_input + pages.size(), - page_input, - chunk_row_output_iter{pages.device_ptr()}); - - // computes: - // PageNestingInfo::size for each level of nesting, for each page, taking row bounds into account. - // PageInfo::skipped_values, which tells us where to start decoding in the input . - // It is only necessary to do this second pass if uses_custom_row_bounds is set (if the user has - // specified artifical bounds). - if (uses_custom_row_bounds) { - gpuComputePageSizes<<>>( - pages.device_ptr(), chunks, min_row, num_rows, true); - } - - // ordering of pages is by input column schema, repeated across row groups. so - // if we had 3 columns, each with 2 pages, and 1 row group, our schema values might look like - // - // 1, 1, 2, 2, 3, 3 - // - // However, if we had more than one row group, the pattern would be - // - // 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3 - // ^ row group 0 | - // ^ row group 1 - // - // To use exclusive_scan_by_key, the ordering we actually want is - // - // 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3 - // - // We also need to preserve key-relative page ordering, so we need to use a stable sort. - rmm::device_uvector page_keys(pages.size(), stream); - rmm::device_uvector page_index(pages.size(), stream); - { - thrust::transform(rmm::exec_policy(stream), - pages.device_ptr(), - pages.device_ptr() + pages.size(), - page_keys.begin(), - [] __device__(PageInfo const& page) { return page.src_col_schema; }); - - thrust::sequence(rmm::exec_policy(stream), page_index.begin(), page_index.end()); - thrust::stable_sort_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - page_index.begin(), - thrust::less()); - } - - // compute output column sizes by examining the pages of the -input- columns - for (size_t idx = 0; idx < input_columns.size(); idx++) { - auto const& input_col = input_columns[idx]; - auto src_col_schema = input_col.schema_idx; - size_t max_depth = input_col.nesting_depth(); - - auto* cols = &output_columns; - for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { - auto& out_buf = (*cols)[input_col.nesting[l_idx]]; - cols = &out_buf.children; - - // size iterator. indexes pages by sorted order - auto size_input = thrust::make_transform_iterator( - page_index.begin(), - [src_col_schema, l_idx, pages = pages.device_ptr()] __device__(int index) { - auto const& page = pages[index]; - if (page.src_col_schema != src_col_schema || page.flags & PAGEINFO_FLAGS_DICTIONARY) { - return 0; - } - return page.nesting[l_idx].size; - }); - - // compute column size. - // for struct columns, higher levels of the output columns are shared between input - // columns. so don't compute any given level more than once. - if (out_buf.size == 0) { - int size = thrust::reduce(rmm::exec_policy(stream), size_input, size_input + pages.size()); - - // Handle a specific corner case. It is possible to construct a parquet file such that - // a column within a row group contains more rows than the row group itself. 
This may be - // invalid, but we have seen instances of this in the wild, including how they were created - // using the apache parquet tools. Normally, the trim pass would handle this case quietly, - // but if we are not running the trim pass (which is most of the time) we need to cap the - // number of rows we will allocate/read from the file with the amount specified in the - // associated row group. This only applies to columns that are not children of lists as - // those may have an arbitrary number of rows in them. - if (!uses_custom_row_bounds && - !(out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) && - size > static_cast(num_rows)) { - size = static_cast(num_rows); - } - - // if this is a list column add 1 for non-leaf levels for the terminating offset - if (out_buf.type.id() == type_id::LIST && l_idx < max_depth) { size++; } - - // allocate - out_buf.create(size, stream, mr); - } - - // compute per-page start offset - thrust::exclusive_scan_by_key(rmm::exec_policy(stream), - page_keys.begin(), - page_keys.end(), - size_input, - start_offset_output_iterator{pages.device_ptr(), - page_index.begin(), - 0, - static_cast(src_col_schema), - static_cast(l_idx)}); - } - } - - // retrieve pages back - pages.device_to_host(stream); + pages.device_ptr(), chunks, min_row, num_rows, compute_num_rows, compute_string_sizes); } /** @@ -1893,6 +1907,8 @@ void __host__ DecodePageData(hostdevice_vector& pages, size_t min_row, rmm::cuda_stream_view stream) { + CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); + dim3 dim_block(block_size, 1); dim3 dim_grid(pages.size(), 1); // 1 threadblock per page diff --git a/cpp/src/io/parquet/page_enc.cu b/cpp/src/io/parquet/page_enc.cu index cdee066a06a..74e98de4100 100644 --- a/cpp/src/io/parquet/page_enc.cu +++ b/cpp/src/io/parquet/page_enc.cu @@ -13,7 +13,8 @@ * See the License for the specific language governing permissions and * limitations under the License. 
*/ -#include "parquet_gpu.hpp" + +#include "parquet_gpu.cuh" #include @@ -61,6 +62,12 @@ constexpr int32_t NO_TRUNC_STATS = 0; // minimum scratch space required for encoding statistics constexpr size_t MIN_STATS_SCRATCH_SIZE = sizeof(__int128_t); +// mask to determine lane id +constexpr uint32_t WARP_MASK = cudf::detail::warp_size - 1; + +// currently 64k - 1 +constexpr uint32_t MAX_GRID_Y_SIZE = (1 << 16) - 1; + struct frag_init_state_s { parquet_column_device_view col; PageFragment frag; @@ -116,82 +123,87 @@ __global__ void __launch_bounds__(block_size) using block_reduce = cub::BlockReduce; __shared__ typename block_reduce::TempStorage reduce_storage; - frag_init_state_s* const s = &state_g; - uint32_t t = threadIdx.x; - int frag_y = blockIdx.y; - auto const physical_type = col_desc[blockIdx.x].physical_type; + frag_init_state_s* const s = &state_g; + uint32_t const t = threadIdx.x; + auto const physical_type = col_desc[blockIdx.x].physical_type; + uint32_t const num_fragments_per_column = frag.size().second; - if (t == 0) s->col = col_desc[blockIdx.x]; + if (t == 0) { s->col = col_desc[blockIdx.x]; } __syncthreads(); - if (!t) { - // Find which partition this fragment came from - auto it = - thrust::upper_bound(thrust::seq, part_frag_offset.begin(), part_frag_offset.end(), frag_y); - int p = it - part_frag_offset.begin() - 1; - int part_end_row = partitions[p].start_row + partitions[p].num_rows; - s->frag.start_row = (frag_y - part_frag_offset[p]) * fragment_size + partitions[p].start_row; - - // frag.num_rows = fragment_size except for the last fragment in partition which can be smaller. - // num_rows is fixed but fragment size could be larger if the data is strings or nested. - s->frag.num_rows = min(fragment_size, part_end_row - s->frag.start_row); - s->frag.num_dict_vals = 0; - s->frag.fragment_data_size = 0; - s->frag.dict_data_size = 0; - - s->frag.start_value_idx = row_to_value_idx(s->frag.start_row, s->col); - size_type end_value_idx = row_to_value_idx(s->frag.start_row + s->frag.num_rows, s->col); - s->frag.num_leaf_values = end_value_idx - s->frag.start_value_idx; - - if (s->col.level_offsets != nullptr) { - // For nested schemas, the number of values in a fragment is not directly related to the - // number of encoded data elements or the number of rows. It is simply the number of - // repetition/definition values which together encode validity and nesting information. - size_type first_level_val_idx = s->col.level_offsets[s->frag.start_row]; - size_type last_level_val_idx = s->col.level_offsets[s->frag.start_row + s->frag.num_rows]; - s->frag.num_values = last_level_val_idx - first_level_val_idx; - } else { - s->frag.num_values = s->frag.num_rows; - } - } + auto const leaf_type = s->col.leaf_column->type().id(); auto const dtype_len = physical_type_len(physical_type, leaf_type); - __syncthreads(); - size_type nvals = s->frag.num_leaf_values; - size_type start_value_idx = s->frag.start_value_idx; - - for (uint32_t i = 0; i < nvals; i += block_size) { - uint32_t val_idx = start_value_idx + i + t; - uint32_t is_valid = (i + t < nvals && val_idx < s->col.leaf_column->size()) - ? 
s->col.leaf_column->is_valid(val_idx) - : 0; - uint32_t len; - if (is_valid) { - len = dtype_len; - if (physical_type == BYTE_ARRAY) { - switch (leaf_type) { - case type_id::STRING: { - auto str = s->col.leaf_column->element(val_idx); - len += str.size_bytes(); - } break; - case type_id::LIST: { - auto list_element = - get_element(*s->col.leaf_column, val_idx); - len += list_element.size_bytes(); - } break; - default: CUDF_UNREACHABLE("Unsupported data type for leaf column"); - } + for (uint32_t frag_y = blockIdx.y; frag_y < num_fragments_per_column; frag_y += gridDim.y) { + if (t == 0) { + // Find which partition this fragment came from + auto it = + thrust::upper_bound(thrust::seq, part_frag_offset.begin(), part_frag_offset.end(), frag_y); + int p = it - part_frag_offset.begin() - 1; + int part_end_row = partitions[p].start_row + partitions[p].num_rows; + s->frag.start_row = (frag_y - part_frag_offset[p]) * fragment_size + partitions[p].start_row; + + // frag.num_rows = fragment_size except for the last fragment in partition which can be + // smaller. num_rows is fixed but fragment size could be larger if the data is strings or + // nested. + s->frag.num_rows = min(fragment_size, part_end_row - s->frag.start_row); + s->frag.num_dict_vals = 0; + s->frag.fragment_data_size = 0; + s->frag.dict_data_size = 0; + + s->frag.start_value_idx = row_to_value_idx(s->frag.start_row, s->col); + size_type end_value_idx = row_to_value_idx(s->frag.start_row + s->frag.num_rows, s->col); + s->frag.num_leaf_values = end_value_idx - s->frag.start_value_idx; + + if (s->col.level_offsets != nullptr) { + // For nested schemas, the number of values in a fragment is not directly related to the + // number of encoded data elements or the number of rows. It is simply the number of + // repetition/definition values which together encode validity and nesting information. + size_type first_level_val_idx = s->col.level_offsets[s->frag.start_row]; + size_type last_level_val_idx = s->col.level_offsets[s->frag.start_row + s->frag.num_rows]; + s->frag.num_values = last_level_val_idx - first_level_val_idx; + } else { + s->frag.num_values = s->frag.num_rows; } - } else { - len = 0; } + __syncthreads(); + + size_type nvals = s->frag.num_leaf_values; + size_type start_value_idx = s->frag.start_value_idx; - len = block_reduce(reduce_storage).Sum(len); - if (!t) { s->frag.fragment_data_size += len; } + for (uint32_t i = 0; i < nvals; i += block_size) { + uint32_t val_idx = start_value_idx + i + t; + uint32_t is_valid = (i + t < nvals && val_idx < s->col.leaf_column->size()) + ? s->col.leaf_column->is_valid(val_idx) + : 0; + uint32_t len; + if (is_valid) { + len = dtype_len; + if (physical_type == BYTE_ARRAY) { + switch (leaf_type) { + case type_id::STRING: { + auto str = s->col.leaf_column->element(val_idx); + len += str.size_bytes(); + } break; + case type_id::LIST: { + auto list_element = + get_element(*s->col.leaf_column, val_idx); + len += list_element.size_bytes(); + } break; + default: CUDF_UNREACHABLE("Unsupported data type for leaf column"); + } + } + } else { + len = 0; + } + + len = block_reduce(reduce_storage).Sum(len); + if (t == 0) { s->frag.fragment_data_size += len; } + __syncthreads(); + } __syncthreads(); + if (t == 0) { frag[blockIdx.x][frag_y] = s->frag; } } - __syncthreads(); - if (t == 0) frag[blockIdx.x][blockIdx.y] = s->frag; } // blockDim {128,1,1} @@ -200,21 +212,29 @@ __global__ void __launch_bounds__(128) device_2dspan fragments, device_span col_desc) { - // TODO: why not 1 block per warp? 
- __shared__ __align__(8) statistics_group group_g[4]; - - uint32_t lane_id = threadIdx.x & 0x1f; - uint32_t frag_id = blockIdx.y * 4 + (threadIdx.x >> 5); - uint32_t column_id = blockIdx.x; - auto num_fragments_per_column = fragments.size().second; - statistics_group* const g = &group_g[threadIdx.x >> 5]; - if (!lane_id && frag_id < num_fragments_per_column) { - g->col = &col_desc[column_id]; - g->start_row = fragments[column_id][frag_id].start_value_idx; - g->num_rows = fragments[column_id][frag_id].num_leaf_values; + uint32_t const lane_id = threadIdx.x & WARP_MASK; + uint32_t const column_id = blockIdx.x; + uint32_t const num_fragments_per_column = fragments.size().second; + + uint32_t frag_id = blockIdx.y * 4 + (threadIdx.x / cudf::detail::warp_size); + while (frag_id < num_fragments_per_column) { + if (lane_id == 0) { + statistics_group g; + g.col = &col_desc[column_id]; + g.start_row = fragments[column_id][frag_id].start_value_idx; + g.num_rows = fragments[column_id][frag_id].num_leaf_values; + groups[column_id][frag_id] = g; + } + frag_id += gridDim.y * 4; } - __syncthreads(); - if (frag_id < num_fragments_per_column and lane_id == 0) groups[column_id][frag_id] = *g; +} + +constexpr uint32_t max_RLE_page_size(uint8_t value_bit_width, uint32_t num_values) +{ + if (value_bit_width == 0) return 0; + + // Run length = 4, max(rle/bitpack header) = 5, add one byte per 256 values for overhead + return 4 + 5 + util::div_rounding_up_unsafe(num_values * value_bit_width, 8) + (num_values / 256); } // blockDim {128,1,1} @@ -329,7 +349,7 @@ __global__ void __launch_bounds__(128) __syncwarp(); uint32_t fragment_data_size = (ck_g.use_dictionary) - ? frag_g.num_leaf_values * 2 // Assume worst-case of 2-bytes per dictionary index + ? frag_g.num_leaf_values * util::div_rounding_up_unsafe(ck_g.dict_rle_bits, 8) : frag_g.fragment_data_size; // TODO (dm): this convoluted logic to limit page size needs refactoring size_t this_max_page_size = (values_in_page * 2 >= ck_g.num_values) ? 256 * 1024 @@ -343,8 +363,8 @@ __global__ void __launch_bounds__(128) (values_in_page > 0 && (page_size + fragment_data_size > this_max_page_size)) || rows_in_page >= max_page_size_rows) { if (ck_g.use_dictionary) { - page_size = - 1 + 5 + ((values_in_page * ck_g.dict_rle_bits + 7) >> 3) + (values_in_page >> 8); + // Additional byte to store entry bit width + page_size = 1 + max_RLE_page_size(ck_g.dict_rle_bits, values_in_page); } if (!t) { page_g.num_fragments = fragments_in_chunk - page_start; @@ -367,23 +387,13 @@ __global__ void __launch_bounds__(128) if (not comp_page_sizes.empty()) { page_g.compressed_data = ck_g.compressed_bfr + comp_page_offset; } - page_g.start_row = cur_row; - page_g.num_rows = rows_in_page; - page_g.num_leaf_values = leaf_values_in_page; - page_g.num_values = values_in_page; - uint32_t def_level_bits = col_g.num_def_level_bits(); - uint32_t rep_level_bits = col_g.num_rep_level_bits(); - // Run length = 4, max(rle/bitpack header) = 5, add one byte per 256 values for overhead - // TODO (dm): Improve readability of these calculations. - uint32_t def_level_size = - (def_level_bits != 0) - ? 4 + 5 + ((def_level_bits * page_g.num_values + 7) >> 3) + (page_g.num_values >> 8) - : 0; - uint32_t rep_level_size = - (rep_level_bits != 0) - ? 
4 + 5 + ((rep_level_bits * page_g.num_values + 7) >> 3) + (page_g.num_values >> 8)
-          : 0;
-      page_g.max_data_size = page_size + def_level_size + rep_level_size;
+        page_g.start_row       = cur_row;
+        page_g.num_rows        = rows_in_page;
+        page_g.num_leaf_values = leaf_values_in_page;
+        page_g.num_values      = values_in_page;
+        auto const def_level_size = max_RLE_page_size(col_g.num_def_level_bits(), values_in_page);
+        auto const rep_level_size = max_RLE_page_size(col_g.num_rep_level_bits(), values_in_page);
+        page_g.max_data_size      = page_size + def_level_size + rep_level_size;

       pagestats_g.start_chunk = ck_g.first_fragment + page_start;
       pagestats_g.num_chunks  = page_g.num_fragments;
@@ -1100,15 +1110,20 @@ __global__ void __launch_bounds__(128, 8)
     if (t == 0) { s->cur = dst + total_len; }
     if (is_valid) {
       switch (physical_type) {
-        case INT32:
+        case INT32: [[fallthrough]];
         case FLOAT: {
-          int32_t v;
-          if (dtype_len_in == 4)
-            v = s->col.leaf_column->element<int32_t>(val_idx);
-          else if (dtype_len_in == 2)
-            v = s->col.leaf_column->element<int16_t>(val_idx);
-          else
-            v = s->col.leaf_column->element<int8_t>(val_idx);
+          auto const v = [dtype_len = dtype_len_in,
+                          idx       = val_idx,
+                          col       = s->col.leaf_column,
+                          scale     = s->col.ts_scale == 0 ? 1 : s->col.ts_scale]() -> int32_t {
+            switch (dtype_len) {
+              case 8: return col->element<int64_t>(idx) * scale;
+              case 4: return col->element<int32_t>(idx) * scale;
+              case 2: return col->element<int16_t>(idx) * scale;
+              default: return col->element<int8_t>(idx) * scale;
+            }
+          }();
+
           dst[pos + 0] = v;
           dst[pos + 1] = v >> 8;
           dst[pos + 2] = v >> 16;
@@ -2017,9 +2032,10 @@ void InitPageFragments(device_2dspan<PageFragment> frag,
                        uint32_t fragment_size,
                        rmm::cuda_stream_view stream)
 {
-  auto num_columns              = frag.size().first;
-  auto num_fragments_per_column = frag.size().second;
-  dim3 dim_grid(num_columns, num_fragments_per_column);  // 1 threadblock per fragment
+  auto const num_columns              = frag.size().first;
+  auto const num_fragments_per_column = frag.size().second;
+  auto const grid_y = std::min(static_cast<uint32_t>(num_fragments_per_column), MAX_GRID_Y_SIZE);
+  dim3 const dim_grid(num_columns, grid_y);  // 1 threadblock per fragment (strided if gridDim.y was clamped)
   gpuInitPageFragments<512><<<dim_grid, 512, 0, stream.value()>>>(
     frag, col_desc, partitions, part_frag_offset, fragment_size);
 }
@@ -2031,8 +2047,10 @@ void InitFragmentStatistics(device_2dspan<statistics_group> groups,
 {
   int const num_columns              = col_desc.size();
   int const num_fragments_per_column = fragments.size().second;
-  auto grid_y = util::div_rounding_up_safe(num_fragments_per_column, 128 / cudf::detail::warp_size);
-  dim3 dim_grid(num_columns, grid_y);  // 1 warp per fragment
+  auto const y_dim =
+    util::div_rounding_up_safe(num_fragments_per_column, 128 / cudf::detail::warp_size);
+  auto const grid_y = std::min(static_cast<uint32_t>(y_dim), MAX_GRID_Y_SIZE);
+  dim3 const dim_grid(num_columns, grid_y);  // 1 warp per fragment (strided if gridDim.y was clamped)
   gpuInitFragmentStats<<<dim_grid, 128, 0, stream.value()>>>(groups, fragments, col_desc);
 }
diff --git a/cpp/src/io/parquet/page_hdr.cu b/cpp/src/io/parquet/page_hdr.cu
index e7856a871c1..ffb4cb60a20 100644
--- a/cpp/src/io/parquet/page_hdr.cu
+++ b/cpp/src/io/parquet/page_hdr.cu
@@ -307,10 +307,11 @@ struct gpuParseDataPageHeaderV2 {
   __device__ bool operator()(byte_stream_s* bs)
   {
     auto op = thrust::make_tuple(ParquetFieldInt32(1, bs->page.num_input_values),
+                                 ParquetFieldInt32(2, bs->page.num_nulls),
                                  ParquetFieldInt32(3, bs->page.num_rows),
                                  ParquetFieldEnum<Encoding>(4, bs->page.encoding),
-                                 ParquetFieldEnum<Encoding>(5, bs->page.definition_level_encoding),
-                                 ParquetFieldEnum<Encoding>(6, bs->page.repetition_level_encoding));
+                                 ParquetFieldInt32(5, bs->page.def_lvl_bytes),
+                                 ParquetFieldInt32(6, bs->page.rep_lvl_bytes));
    return
parse_header(op, bs); } }; @@ -366,6 +367,7 @@ __global__ void __launch_bounds__(128) // definition levels bs->page.chunk_row = 0; bs->page.num_rows = 0; + bs->page.str_bytes = 0; } num_values = bs->ck.num_values; page_info = bs->ck.page_info; @@ -382,18 +384,30 @@ __global__ void __launch_bounds__(128) // definition levels bs->page.chunk_row += bs->page.num_rows; bs->page.num_rows = 0; + // zero out V2 info + bs->page.num_nulls = 0; + bs->page.def_lvl_bytes = 0; + bs->page.rep_lvl_bytes = 0; if (parse_page_header(bs) && bs->page.compressed_page_size >= 0) { switch (bs->page_type) { case PageType::DATA_PAGE: + index_out = num_dict_pages + data_page_count; + data_page_count++; + bs->page.flags = 0; // this computation is only valid for flat schemas. for nested schemas, // they will be recomputed in the preprocess step by examining repetition and // definition levels bs->page.num_rows = bs->page.num_input_values; + values_found += bs->page.num_input_values; + break; case PageType::DATA_PAGE_V2: index_out = num_dict_pages + data_page_count; data_page_count++; bs->page.flags = 0; values_found += bs->page.num_input_values; + // V2 only uses RLE, so it was removed from the header + bs->page.definition_level_encoding = Encoding::RLE; + bs->page.repetition_level_encoding = Encoding::RLE; break; case PageType::DICTIONARY_PAGE: index_out = dictionary_page_count; diff --git a/cpp/src/io/parquet/parquet_gpu.cuh b/cpp/src/io/parquet/parquet_gpu.cuh new file mode 100644 index 00000000000..793573b465e --- /dev/null +++ b/cpp/src/io/parquet/parquet_gpu.cuh @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "parquet_gpu.hpp" + +#include +#include + +#include + +namespace cudf::io::parquet::gpu { + +auto constexpr KEY_SENTINEL = size_type{-1}; +auto constexpr VALUE_SENTINEL = size_type{-1}; + +using map_type = cuco::static_map; + +/** + * @brief The alias of `map_type::pair_atomic_type` class. + * + * Declare this struct by trivial subclassing instead of type aliasing so we can have forward + * declaration of this struct somewhere else. + */ +struct slot_type : public map_type::pair_atomic_type { +}; + +/** + * @brief Return the byte length of parquet dtypes that are physically represented by INT32 + */ +inline uint32_t __device__ int32_logical_len(type_id id) +{ + switch (id) { + case cudf::type_id::INT8: [[fallthrough]]; + case cudf::type_id::UINT8: return 1; + case cudf::type_id::INT16: [[fallthrough]]; + case cudf::type_id::UINT16: return 2; + case cudf::type_id::DURATION_SECONDS: [[fallthrough]]; + case cudf::type_id::DURATION_MILLISECONDS: return 8; + default: return 4; + } +} + +/** + * @brief Translate the row index of a parent column_device_view into the index of the first value + * in the leaf child. + * Only works in the context of parquet writer where struct columns are previously modified s.t. + * they only have one immediate child. 
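+ *
+ * Illustrative example (not from the original sources): for a LIST<INT32> column
+ * whose offsets are [0, 2, 5, 9], row index 2 maps to leaf value index
+ * offset_at(2) == 5, i.e. the first element of the third list; LIST and STRUCT
+ * levels below simply repeat the walk one level down.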
+ */ +inline size_type __device__ row_to_value_idx(size_type idx, + parquet_column_device_view const& parquet_col) +{ + // with a byte array, we can't go all the way down to the leaf node, but instead we want to leave + // the size at the parent level because we are writing out parent row byte arrays. + auto col = *parquet_col.parent_column; + while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { + if (col.type().id() == type_id::STRUCT) { + idx += col.offset(); + col = col.child(0); + } else { + auto list_col = cudf::detail::lists_column_device_view(col); + auto child = list_col.child(); + if (parquet_col.output_as_byte_array && child.type().id() == type_id::UINT8) { break; } + idx = list_col.offset_at(idx); + col = child; + } + } + return idx; +} + +} // namespace cudf::io::parquet::gpu diff --git a/cpp/src/io/parquet/parquet_gpu.hpp b/cpp/src/io/parquet/parquet_gpu.hpp index 8f4cd5c6f3b..ccf4b056ae8 100644 --- a/cpp/src/io/parquet/parquet_gpu.hpp +++ b/cpp/src/io/parquet/parquet_gpu.hpp @@ -23,14 +23,10 @@ #include "io/utilities/column_buffer.hpp" #include "io/utilities/hostdevice_vector.hpp" -#include -#include -#include +#include #include #include -#include - #include #include #include @@ -39,9 +35,7 @@ #include -namespace cudf { -namespace io { -namespace parquet { +namespace cudf::io::parquet { using cudf::io::detail::string_index_pair; @@ -57,19 +51,21 @@ constexpr size_type MAX_DICT_SIZE = (1 << MAX_DICT_BITS) - 1; struct input_column_info { int schema_idx; std::string name; + bool has_repetition; // size == nesting depth. the associated real output // buffer index in the dest column for each level of nesting. std::vector nesting; + + input_column_info(int _schema_idx, std::string _name, bool _has_repetition) + : schema_idx(_schema_idx), name(_name), has_repetition(_has_repetition) + { + } + auto nesting_depth() const { return nesting.size(); } }; namespace gpu { -auto constexpr KEY_SENTINEL = size_type{-1}; -auto constexpr VALUE_SENTINEL = size_type{-1}; -using map_type = cuco::static_map; -using slot_type = map_type::pair_atomic_type; - /** * @brief Enums for the flags in the page header */ @@ -99,9 +95,13 @@ struct PageNestingInfo { // set at initialization int32_t max_def_level; int32_t max_rep_level; + cudf::type_id type; // type of the corresponding cudf output column + bool nullable; // set during preprocessing - int32_t size; // this page/nesting-level's size contribution to the output column + int32_t size; // this page/nesting-level's row count contribution to the output column, if fully + // decoded + int32_t batch_size; // the size of the page for this batch int32_t page_start_value; // absolute output start index in output column data // set during data decoding @@ -121,6 +121,10 @@ struct PageInfo { // decompression int32_t compressed_page_size; // compressed data size in bytes int32_t uncompressed_page_size; // uncompressed data size in bytes + // for V2 pages, the def and rep level data is not compressed, and lacks the 4-byte length + // indicator. instead the lengths for these are stored in the header. + int32_t def_lvl_bytes; // length of the definition levels (V2 header) + int32_t rep_lvl_bytes; // length of the repetition levels (V2 header) // Number of values in this data page or dictionary. // Important : the # of input values does not necessarily // correspond to the number of rows in the output. 
It just reflects the number @@ -131,6 +135,7 @@ struct PageInfo { int32_t num_input_values; int32_t chunk_row; // starting row of this page relative to the start of the chunk int32_t num_rows; // number of rows in this page + int32_t num_nulls; // number of null values (V2 header) int32_t chunk_idx; // column chunk this page belongs to int32_t src_col_schema; // schema index of this column uint8_t flags; // PAGEINFO_FLAGS_XXX @@ -150,6 +155,9 @@ struct PageInfo { int skipped_values; // # of values skipped in the actual data stream. int skipped_leaf_values; + // for string columns only, the size of all the chars in the string for + // this page. only valid/computed during the base preprocess pass + int32_t str_bytes; // nesting information (input/output) for each page int num_nesting_levels; @@ -235,6 +243,34 @@ struct ColumnChunkDesc { int32_t src_col_schema; // my schema index in the file }; +/** + * @brief Struct to store raw/intermediate file data before parsing. + */ +struct file_intermediate_data { + std::vector> raw_page_data; + rmm::device_buffer decomp_page_data; + hostdevice_vector chunks{}; + hostdevice_vector pages_info{}; + hostdevice_vector page_nesting_info{}; +}; + +/** + * @brief Struct to store intermediate page data for parsing each chunk of rows in chunked reading. + */ +struct chunk_intermediate_data { + rmm::device_uvector page_keys{0, rmm::cuda_stream_default}; + rmm::device_uvector page_index{0, rmm::cuda_stream_default}; + rmm::device_uvector str_dict_index{0, rmm::cuda_stream_default}; +}; + +/** + * @brief Structs to identify the reading row range for each chunk of rows in chunked reading. + */ +struct chunk_read_info { + size_t skip_rows; + size_t num_rows; +}; + /** * @brief Struct describing an encoder column */ @@ -281,51 +317,8 @@ struct PageFragment { constexpr unsigned int kDictHashBits = 16; constexpr size_t kDictScratchSize = (1 << kDictHashBits) * sizeof(uint32_t); -/** - * @brief Return the byte length of parquet dtypes that are physically represented by INT32 - */ -inline uint32_t __device__ int32_logical_len(type_id id) -{ - switch (id) { - case cudf::type_id::INT8: - case cudf::type_id::UINT8: return 1; - case cudf::type_id::INT16: - case cudf::type_id::UINT16: return 2; - default: return 4; - } -} - -/** - * @brief Translate the row index of a parent column_device_view into the index of the first value - * in the leaf child. - * Only works in the context of parquet writer where struct columns are previously modified s.t. - * they only have one immediate child. - */ -inline size_type __device__ row_to_value_idx(size_type idx, - parquet_column_device_view const& parquet_col) -{ - // with a byte array, we can't go all the way down to the leaf node, but instead we want to leave - // the size at the parent level because we are writing out parent row byte arrays. 
- auto col = *parquet_col.parent_column; - while (col.type().id() == type_id::LIST or col.type().id() == type_id::STRUCT) { - if (col.type().id() == type_id::STRUCT) { - idx += col.offset(); - col = col.child(0); - } else { - auto list_col = cudf::detail::lists_column_device_view(col); - auto child = list_col.child(); - if (parquet_col.output_as_byte_array && - (child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8)) { - break; - } - idx = list_col.offset_at(idx); - col = child; - } - } - return idx; -} - struct EncPage; +struct slot_type; /** * @brief Struct describing an encoder column chunk @@ -408,35 +401,35 @@ void BuildStringDictionaryIndex(ColumnChunkDesc* chunks, rmm::cuda_stream_view stream); /** - * @brief Preprocess column information for nested schemas. + * @brief Compute page output size information. + * + * When dealing with nested hierarchies (those that contain lists), or when doing a chunked + * read, we need to obtain more information up front than we have with just the row counts. * - * There are several pieces of information we can't compute directly from row counts in - * the parquet headers when dealing with nested schemas. - * - The total sizes of all output columns at all nesting levels - * - The starting output buffer offset for each page, for each nesting level - * For flat schemas, these values are computed during header decoding (see gpuDecodePageHeaders) + * - We need to determine the sizes of each output cudf column per page + * - We need to determine information about where to start decoding the value stream + * if we are using custom user bounds (skip_rows / num_rows) + * - We need to determine actual number of top level rows per page + * - If we are doing a chunked read, we need to determine the total string size per page * - * Note : this function is where output device memory is allocated for nested columns. 
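+ * Note : unlike its predecessor (PreprocessColumnData), this function allocates no
+ * output device memory; the reader now allocates the output columns in a separate
+ * step (see the allocate_columns call in reader_impl.cpp).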
* * @param pages All pages to be decoded * @param chunks All chunks to be decoded - * @param input_columns Input column information - * @param output_columns Output column information * @param num_rows Maximum number of rows to read * @param min_rows crop all rows below min_row - * @param uses_custom_row_bounds Whether or not num_rows and min_rows represents user-specific - * bounds - * @param stream Cuda stream + * @param compute_num_rows If set to true, the num_rows field in PageInfo will be + * computed + * @param compute_string_sizes If set to true, the str_bytes field in PageInfo will + * be computed + * @param stream CUDA stream to use, default 0 */ -void PreprocessColumnData(hostdevice_vector& pages, - hostdevice_vector const& chunks, - std::vector& input_columns, - std::vector& output_columns, - size_t num_rows, - size_t min_row, - bool uses_custom_row_bounds, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr); +void ComputePageSizes(hostdevice_vector& pages, + hostdevice_vector const& chunks, + size_t num_rows, + size_t min_row, + bool compute_num_rows, + bool compute_string_sizes, + rmm::cuda_stream_view stream); /** * @brief Launches kernel for reading the column data stored in the pages @@ -619,6 +612,4 @@ void EncodeColumnIndexes(device_span chunks, rmm::cuda_stream_view stream); } // namespace gpu -} // namespace parquet -} // namespace io -} // namespace cudf +} // namespace cudf::io::parquet diff --git a/cpp/src/io/parquet/reader.cpp b/cpp/src/io/parquet/reader.cpp new file mode 100644 index 00000000000..1321e8073d7 --- /dev/null +++ b/cpp/src/io/parquet/reader.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include "reader_impl.hpp" + +namespace cudf::io::detail::parquet { + +reader::reader() = default; + +reader::reader(std::vector>&& sources, + parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) + : _impl(std::make_unique(std::move(sources), options, stream, mr)) +{ +} + +reader::~reader() = default; + +table_with_metadata reader::read(parquet_reader_options const& options) +{ + // if the user has specified custom row bounds + bool const uses_custom_row_bounds = options.get_num_rows() >= 0 || options.get_skip_rows() != 0; + return _impl->read(options.get_skip_rows(), + options.get_num_rows(), + uses_custom_row_bounds, + options.get_row_groups()); +} + +chunked_reader::chunked_reader(std::size_t chunk_read_limit, + std::vector>&& sources, + parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr) +{ + _impl = std::make_unique(chunk_read_limit, std::move(sources), options, stream, mr); +} + +chunked_reader::~chunked_reader() = default; + +bool chunked_reader::has_next() const { return _impl->has_next(); } + +table_with_metadata chunked_reader::read_chunk() const { return _impl->read_chunk(); } + +} // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/parquet/reader_impl.cpp b/cpp/src/io/parquet/reader_impl.cpp new file mode 100644 index 00000000000..84d8cfc273f --- /dev/null +++ b/cpp/src/io/parquet/reader_impl.cpp @@ -0,0 +1,370 @@ +/* + * Copyright (c) 2019-2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "reader_impl.hpp" + +#include + +#include + +namespace cudf::io::detail::parquet { + +void reader::impl::decode_page_data(size_t skip_rows, size_t num_rows) +{ + auto& chunks = _file_itm_data.chunks; + auto& pages = _file_itm_data.pages_info; + auto& page_nesting = _file_itm_data.page_nesting_info; + + // Should not reach here if there is no page data. + CUDF_EXPECTS(pages.size() > 0, "There is no page to decode"); + + size_t const sum_max_depths = std::accumulate( + chunks.begin(), chunks.end(), 0, [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { + return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); + }); + + // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector + // to store all per-chunk pointers to nested data/nullmask. `chunk_offsets[i]` will store the + // offset into `chunk_nested_data`/`chunk_nested_valids` for the array of pointers for chunk `i` + auto chunk_nested_valids = hostdevice_vector(sum_max_depths, _stream); + auto chunk_nested_data = hostdevice_vector(sum_max_depths, _stream); + auto chunk_offsets = std::vector(); + + // Update chunks with pointers to column data. 
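+  // Illustrative example (not in the original): with two chunks of output nesting
+  // depth 2 and 3, sum_max_depths == 5, chunk_offsets == {0, 2}, and chunk 1's
+  // per-depth pointers occupy slots [2, 5) of chunk_nested_valids/chunk_nested_data.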
+  for (size_t c = 0, page_count = 0, chunk_off = 0; c < chunks.size(); c++) {
+    input_column_info const& input_col = _input_columns[chunks[c].src_col_index];
+    CUDF_EXPECTS(input_col.schema_idx == chunks[c].src_col_schema,
+                 "Column/page schema index mismatch");
+
+    size_t max_depth = _metadata->get_output_nesting_depth(chunks[c].src_col_schema);
+    chunk_offsets.push_back(chunk_off);
+
+    // get a slice of size `nesting depth` from `chunk_nested_valids` to store an array of pointers
+    // to validity data
+    auto valids              = chunk_nested_valids.host_ptr(chunk_off);
+    chunks[c].valid_map_base = chunk_nested_valids.device_ptr(chunk_off);
+
+    // get a slice of size `nesting depth` from `chunk_nested_data` to store an array of pointers to
+    // out data
+    auto data                  = chunk_nested_data.host_ptr(chunk_off);
+    chunks[c].column_data_base = chunk_nested_data.device_ptr(chunk_off);
+
+    chunk_off += max_depth;
+
+    // fill in the arrays on the host. there are some important considerations to
+    // take into account here for nested columns. specifically, with structs
+    // there is sharing of output buffers between input columns. consider this schema
+    //
+    //  required group field_id=1 name {
+    //    required binary field_id=2 firstname (String);
+    //    required binary field_id=3 middlename (String);
+    //    required binary field_id=4 lastname (String);
+    // }
+    //
+    // there are 3 input columns of data here (firstname, middlename, lastname), but
+    // only 1 output column (name).  The structure of the output column buffers looks like
+    // the schema itself
+    //
+    // struct      (name)
+    //   string    (firstname)
+    //   string    (middlename)
+    //   string    (lastname)
+    //
+    // The struct column can contain validity information. the problem is, the decode
+    // step for the input columns will all attempt to decode this validity information
+    // because each one has its own copy of the repetition/definition levels. but
+    // since this is all happening in parallel it would mean multiple blocks would
+    // be stomping all over the same memory randomly.  to work around this, we set
+    // things up so that only 1 child of any given nesting level fills in the
+    // data (offsets in the case of lists) or validity information for the higher
+    // levels of the hierarchy that are shared.  In this case, it would mean we
+    // would just choose firstname to be the one that decodes the validity for name.
+    //
+    // we do this by only handing out the pointers to the first child we come across.
+    //
+    auto* cols = &_output_buffers;
+    for (size_t idx = 0; idx < max_depth; idx++) {
+      auto& out_buf = (*cols)[input_col.nesting[idx]];
+      cols          = &out_buf.children;
+
+      int owning_schema = out_buf.user_data & PARQUET_COLUMN_BUFFER_SCHEMA_MASK;
+      if (owning_schema == 0 || owning_schema == input_col.schema_idx) {
+        valids[idx] = out_buf.null_mask();
+        data[idx]   = out_buf.data();
+        out_buf.user_data |=
+          static_cast<uint32_t>(input_col.schema_idx) & PARQUET_COLUMN_BUFFER_SCHEMA_MASK;
+      } else {
+        valids[idx] = nullptr;
+        data[idx]   = nullptr;
+      }
+    }
+
+    // column_data_base will always point to leaf data, even for nested types.
+    page_count += chunks[c].max_num_pages;
+  }
+
+  chunks.host_to_device(_stream);
+  chunk_nested_valids.host_to_device(_stream);
+  chunk_nested_data.host_to_device(_stream);
+
+  gpu::DecodePageData(pages, chunks, num_rows, skip_rows, _stream);
+
+  pages.device_to_host(_stream);
+  page_nesting.device_to_host(_stream);
+  _stream.synchronize();
+
+  // for list columns, add the final offset to every offset buffer.
+  // TODO : make this happen more efficiently. Maybe use thrust::for_each
+  // on each buffer.
+  // Note : the reason we are doing this here instead of in the decode kernel is
+  // that it is difficult/impossible for a given page to know that it is writing the very
+  // last value that should then be followed by a terminator (because rows can span
+  // page boundaries).
+  for (size_t idx = 0; idx < _input_columns.size(); idx++) {
+    input_column_info const& input_col = _input_columns[idx];
+
+    auto* cols = &_output_buffers;
+    for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) {
+      auto& out_buf = (*cols)[input_col.nesting[l_idx]];
+      cols          = &out_buf.children;
+
+      if (out_buf.type.id() != type_id::LIST ||
+          (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED)) {
+        continue;
+      }
+      CUDF_EXPECTS(l_idx < input_col.nesting_depth() - 1, "Encountered a leaf list column");
+      auto& child = (*cols)[input_col.nesting[l_idx + 1]];
+
+      // the final offset for a list at level N is the size of its child
+      int offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size;
+      cudaMemcpyAsync(static_cast<int32_t*>(out_buf.data()) + (out_buf.size - 1),
+                      &offset,
+                      sizeof(offset),
+                      cudaMemcpyHostToDevice,
+                      _stream.value());
+      out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED;
+    }
+  }
+
+  // update null counts in the final column buffers
+  for (size_t idx = 0; idx < pages.size(); idx++) {
+    gpu::PageInfo* pi = &pages[idx];
+    if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; }
+    gpu::ColumnChunkDesc* col          = &chunks[pi->chunk_idx];
+    input_column_info const& input_col = _input_columns[col->src_col_index];
+
+    int index                 = pi->nesting - page_nesting.device_ptr();
+    gpu::PageNestingInfo* pni = &page_nesting[index];
+
+    auto* cols = &_output_buffers;
+    for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) {
+      auto& out_buf = (*cols)[input_col.nesting[l_idx]];
+      cols          = &out_buf.children;
+
+      // if I wasn't the one who wrote out the validity bits, skip it
+      if (chunk_nested_valids.host_ptr(chunk_offsets[pi->chunk_idx])[l_idx] == nullptr) {
+        continue;
+      }
+      out_buf.null_count() += pni[l_idx].null_count;
+    }
+  }
+
+  _stream.synchronize();
+}
+
+reader::impl::impl(std::vector<std::unique_ptr<datasource>>&& sources,
+                   parquet_reader_options const& options,
+                   rmm::cuda_stream_view stream,
+                   rmm::mr::device_memory_resource* mr)
+  : impl(0 /*chunk_read_limit*/,
+         std::forward<std::vector<std::unique_ptr<datasource>>>(sources),
+         options,
+         stream,
+         mr)
+{
+}
+
+reader::impl::impl(std::size_t chunk_read_limit,
+                   std::vector<std::unique_ptr<datasource>>&& sources,
+                   parquet_reader_options const& options,
+                   rmm::cuda_stream_view stream,
+                   rmm::mr::device_memory_resource* mr)
+  : _stream{stream}, _mr{mr}, _sources{std::move(sources)}, _chunk_read_limit{chunk_read_limit}
+{
+  // Open and parse the source dataset metadata
+  _metadata = std::make_unique<aggregate_reader_metadata>(_sources);
+
+  // Override output timestamp resolution if requested
+  if (options.get_timestamp_type().id() != type_id::EMPTY) {
+    _timestamp_type = options.get_timestamp_type();
+  }
+
+  // Strings may be returned as either string or categorical columns
+  _strings_to_categorical = options.is_enabled_convert_strings_to_categories();
+
+  // Binary columns can be read as binary or strings
+  _reader_column_schema = options.get_column_schema();
+
+  // Select only columns required by the options
+  std::tie(_input_columns, _output_buffers, _output_column_schemas) =
+    _metadata->select_columns(options.get_columns(),
+                              options.is_enabled_use_pandas_metadata(),
+                              _strings_to_categorical,
+                              _timestamp_type.id());
+
+  // Save the states of the output buffers for reuse in
`chunk_read()`. + // Don't need to do it if we read the file all at once. + if (_chunk_read_limit > 0) { + for (auto const& buff : _output_buffers) { + _output_buffers_template.emplace_back(column_buffer::empty_like(buff)); + } + } +} + +void reader::impl::prepare_data(size_type skip_rows, + size_type num_rows, + bool uses_custom_row_bounds, + host_span const> row_group_indices) +{ + if (_file_preprocessed) { return; } + + const auto [skip_rows_corrected, num_rows_corrected, row_groups_info] = + _metadata->select_row_groups(row_group_indices, skip_rows, num_rows); + + if (num_rows_corrected > 0 && row_groups_info.size() != 0 && _input_columns.size() != 0) { + load_and_decompress_data(row_groups_info, num_rows_corrected); + preprocess_pages( + skip_rows_corrected, num_rows_corrected, uses_custom_row_bounds, _chunk_read_limit); + + if (_chunk_read_limit == 0) { // read the whole file at once + CUDF_EXPECTS(_chunk_read_info.size() == 1, + "Reading the whole file should yield only one chunk."); + } + } + + _file_preprocessed = true; +} + +table_with_metadata reader::impl::read_chunk_internal(bool uses_custom_row_bounds) +{ + // If `_output_metadata` has been constructed, just copy it over. + auto out_metadata = _output_metadata ? table_metadata{*_output_metadata} : table_metadata{}; + + // output cudf columns as determined by the top level schema + auto out_columns = std::vector>{}; + out_columns.reserve(_output_buffers.size()); + + if (!has_next() || _chunk_read_info.size() == 0) { + return finalize_output(out_metadata, out_columns); + } + + auto const& read_info = _chunk_read_info[_current_read_chunk++]; + + // Allocate memory buffers for the output columns. + allocate_columns(read_info.skip_rows, read_info.num_rows, uses_custom_row_bounds); + + // Parse data into the output buffers. + decode_page_data(read_info.skip_rows, read_info.num_rows); + + // Create the final output cudf columns. + for (size_t i = 0; i < _output_buffers.size(); ++i) { + auto const metadata = _reader_column_schema.has_value() + ? std::make_optional((*_reader_column_schema)[i]) + : std::nullopt; + // Only construct `out_metadata` if `_output_metadata` has not been cached. + if (!_output_metadata) { + column_name_info& col_name = out_metadata.schema_info.emplace_back(""); + out_columns.emplace_back(make_column(_output_buffers[i], &col_name, metadata, _stream, _mr)); + } else { + out_columns.emplace_back(make_column(_output_buffers[i], nullptr, metadata, _stream, _mr)); + } + } + + // Add empty columns if needed. 
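+  // (finalize_output below also fills in column names and user metadata on the
+  // first call, then caches the result in _output_metadata for later chunks.)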
+ return finalize_output(out_metadata, out_columns); +} + +table_with_metadata reader::impl::finalize_output(table_metadata& out_metadata, + std::vector>& out_columns) +{ + // Create empty columns as needed (this can happen if we've ended up with no actual data to read) + for (size_t i = out_columns.size(); i < _output_buffers.size(); ++i) { + if (!_output_metadata) { + column_name_info& col_name = out_metadata.schema_info.emplace_back(""); + out_columns.emplace_back(io::detail::empty_like(_output_buffers[i], &col_name, _stream, _mr)); + } else { + out_columns.emplace_back(io::detail::empty_like(_output_buffers[i], nullptr, _stream, _mr)); + } + } + + if (!_output_metadata) { + // Return column names (must match order of returned columns) + out_metadata.column_names.resize(_output_buffers.size()); + for (size_t i = 0; i < _output_column_schemas.size(); i++) { + auto const& schema = _metadata->get_schema(_output_column_schemas[i]); + out_metadata.column_names[i] = schema.name; + } + + // Return user metadata + out_metadata.per_file_user_data = _metadata->get_key_value_metadata(); + out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), + out_metadata.per_file_user_data[0].end()}; + + // Finally, save the output table metadata into `_output_metadata` for reuse next time. + _output_metadata = std::make_unique(out_metadata); + } + + return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; +} + +table_with_metadata reader::impl::read(size_type skip_rows, + size_type num_rows, + bool uses_custom_row_bounds, + host_span const> row_group_indices) +{ + CUDF_EXPECTS(_chunk_read_limit == 0, "Reading the whole file must not have non-zero byte_limit."); + prepare_data(skip_rows, num_rows, uses_custom_row_bounds, row_group_indices); + return read_chunk_internal(uses_custom_row_bounds); +} + +table_with_metadata reader::impl::read_chunk() +{ + // Reset the output buffers to their original states (right after reader construction). + // Don't need to do it if we read the file all at once. + if (_chunk_read_limit > 0) { + _output_buffers.resize(0); + for (auto const& buff : _output_buffers_template) { + _output_buffers.emplace_back(column_buffer::empty_like(buff)); + } + } + + prepare_data(0 /*skip_rows*/, + -1 /*num_rows, `-1` means unlimited*/, + true /*uses_custom_row_bounds*/, + {} /*row_group_indices, empty means read all row groups*/); + return read_chunk_internal(true); +} + +bool reader::impl::has_next() +{ + prepare_data(0 /*skip_rows*/, + -1 /*num_rows, `-1` means unlimited*/, + true /*uses_custom_row_bounds*/, + {} /*row_group_indices, empty means read all row groups*/); + return _current_read_chunk < _chunk_read_info.size(); +} + +} // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/parquet/reader_impl.cu b/cpp/src/io/parquet/reader_impl.cu deleted file mode 100644 index 59bef6f5600..00000000000 --- a/cpp/src/io/parquet/reader_impl.cu +++ /dev/null @@ -1,1823 +0,0 @@ -/* - * Copyright (c) 2019-2022, NVIDIA CORPORATION. - * - * Licensed under the Apache License, Version 2.0 (the "License"); - * you may not use this file except in compliance with the License. - * You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -/** - * @file reader_impl.cu - * @brief cuDF-IO Parquet reader class implementation - */ - -#include "reader_impl.hpp" - -#include "compact_protocol_reader.hpp" - -#include -#include -#include -#include - -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -#include -#include -#include -#include -#include -#include - -#include -#include -#include -#include - -namespace cudf { -namespace io { -namespace detail { -namespace parquet { -// Import functionality that's independent of legacy code -using namespace cudf::io::parquet; -using namespace cudf::io; - -namespace { - -parquet::ConvertedType logical_type_to_converted_type(parquet::LogicalType const& logical) -{ - if (logical.isset.STRING) { - return parquet::UTF8; - } else if (logical.isset.MAP) { - return parquet::MAP; - } else if (logical.isset.LIST) { - return parquet::LIST; - } else if (logical.isset.ENUM) { - return parquet::ENUM; - } else if (logical.isset.DECIMAL) { - return parquet::DECIMAL; // TODO set decimal values - } else if (logical.isset.DATE) { - return parquet::DATE; - } else if (logical.isset.TIME) { - if (logical.TIME.unit.isset.MILLIS) - return parquet::TIME_MILLIS; - else if (logical.TIME.unit.isset.MICROS) - return parquet::TIME_MICROS; - } else if (logical.isset.TIMESTAMP) { - if (logical.TIMESTAMP.unit.isset.MILLIS) - return parquet::TIMESTAMP_MILLIS; - else if (logical.TIMESTAMP.unit.isset.MICROS) - return parquet::TIMESTAMP_MICROS; - } else if (logical.isset.INTEGER) { - switch (logical.INTEGER.bitWidth) { - case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8; - case 16: return logical.INTEGER.isSigned ? INT_16 : UINT_16; - case 32: return logical.INTEGER.isSigned ? INT_32 : UINT_32; - case 64: return logical.INTEGER.isSigned ? INT_64 : UINT_64; - default: break; - } - } else if (logical.isset.UNKNOWN) { - return parquet::NA; - } else if (logical.isset.JSON) { - return parquet::JSON; - } else if (logical.isset.BSON) { - return parquet::BSON; - } - return parquet::UNKNOWN; -} - -/** - * @brief Function that translates Parquet datatype to cuDF type enum - */ -type_id to_type_id(SchemaElement const& schema, - bool strings_to_categorical, - type_id timestamp_type_id) -{ - parquet::Type const physical = schema.type; - parquet::LogicalType const logical_type = schema.logical_type; - parquet::ConvertedType converted_type = schema.converted_type; - int32_t decimal_scale = schema.decimal_scale; - - // Logical type used for actual data interpretation; the legacy converted type - // is superceded by 'logical' type whenever available. - auto const inferred_converted_type = logical_type_to_converted_type(logical_type); - if (inferred_converted_type != parquet::UNKNOWN) converted_type = inferred_converted_type; - if (inferred_converted_type == parquet::DECIMAL && decimal_scale == 0) - decimal_scale = schema.logical_type.DECIMAL.scale; - - switch (converted_type) { - case parquet::UINT_8: return type_id::UINT8; - case parquet::INT_8: return type_id::INT8; - case parquet::UINT_16: return type_id::UINT16; - case parquet::INT_16: return type_id::INT16; - case parquet::UINT_32: return type_id::UINT32; - case parquet::UINT_64: return type_id::UINT64; - case parquet::DATE: return type_id::TIMESTAMP_DAYS; - case parquet::TIME_MILLIS: - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::DURATION_MILLISECONDS; - case parquet::TIME_MICROS: - return (timestamp_type_id != type_id::EMPTY) ? 
timestamp_type_id - : type_id::DURATION_MICROSECONDS; - case parquet::TIMESTAMP_MILLIS: - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_MILLISECONDS; - case parquet::TIMESTAMP_MICROS: - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_MICROSECONDS; - case parquet::DECIMAL: - if (physical == parquet::INT32) { return type_id::DECIMAL32; } - if (physical == parquet::INT64) { return type_id::DECIMAL64; } - if (physical == parquet::FIXED_LEN_BYTE_ARRAY) { - if (schema.type_length <= static_cast(sizeof(int32_t))) { - return type_id::DECIMAL32; - } - if (schema.type_length <= static_cast(sizeof(int64_t))) { - return type_id::DECIMAL64; - } - if (schema.type_length <= static_cast(sizeof(__int128_t))) { - return type_id::DECIMAL128; - } - } - CUDF_FAIL("Invalid representation of decimal type"); - break; - - // maps are just List>. - case parquet::MAP: - case parquet::LIST: return type_id::LIST; - case parquet::NA: return type_id::STRING; - // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support - default: break; - } - - if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and - logical_type.TIMESTAMP.unit.isset.NANOS) { - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_NANOSECONDS; - } - - // is it simply a struct? - if (schema.is_struct()) { return type_id::STRUCT; } - - // Physical storage type supported by Parquet; controls the on-disk storage - // format in combination with the encoding type. - switch (physical) { - case parquet::BOOLEAN: return type_id::BOOL8; - case parquet::INT32: return type_id::INT32; - case parquet::INT64: return type_id::INT64; - case parquet::FLOAT: return type_id::FLOAT32; - case parquet::DOUBLE: return type_id::FLOAT64; - case parquet::BYTE_ARRAY: - case parquet::FIXED_LEN_BYTE_ARRAY: - // Can be mapped to INT32 (32-bit hash) or STRING - return strings_to_categorical ? type_id::INT32 : type_id::STRING; - case parquet::INT96: - return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id - : type_id::TIMESTAMP_NANOSECONDS; - default: break; - } - - return type_id::EMPTY; -} - -/** - * @brief Converts cuDF type enum to column logical type - */ -data_type to_data_type(type_id t_id, SchemaElement const& schema) -{ - return t_id == type_id::DECIMAL32 || t_id == type_id::DECIMAL64 || t_id == type_id::DECIMAL128 - ? data_type{t_id, numeric::scale_type{-schema.decimal_scale}} - : data_type{t_id}; -} - -/** - * @brief Function that returns the required the number of bits to store a value - */ -template -T required_bits(uint32_t max_level) -{ - return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); -} - -/** - * @brief Converts cuDF units to Parquet units. - * - * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. - */ -std::tuple conversion_info(type_id column_type_id, - type_id timestamp_type_id, - parquet::Type physical, - int8_t converted, - int32_t length) -{ - int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? 
length : 0; - int32_t clock_rate = 0; - if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { - type_width = 1; // I32 -> I8 - } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { - type_width = 2; // I32 -> I16 - } else if (column_type_id == type_id::INT32) { - type_width = 4; // str -> hash32 - } else if (is_chrono(data_type{column_type_id})) { - clock_rate = to_clockrate(timestamp_type_id); - } - - int8_t converted_type = converted; - if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && - not cudf::is_fixed_point(data_type{column_type_id})) { - converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal - } - return std::make_tuple(type_width, clock_rate, converted_type); -} - -inline void decompress_check(device_span results, - rmm::cuda_stream_view stream) -{ - CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), - results.begin(), - results.end(), - [] __device__(auto const& res) { - return res.status == compression_status::SUCCESS; - }), - "Error during decompression"); -} -} // namespace - -std::string name_from_path(const std::vector& path_in_schema) -{ - // For the case of lists, we will see a schema that looks like: - // a.list.element.list.element - // where each (list.item) pair represents a level of nesting. According to the parquet spec, - // https://github.com/apache/parquet-format/blob/master/LogicalTypes.md - // the initial field must be named "list" and the inner element must be named "element". - // If we are dealing with a list, we want to return the topmost name of the group ("a"). - // - // For other nested schemas, like structs we just want to return the bottom-most name. For - // example a struct with the schema - // b.employee.id, the column representing "id" should simply be named "id". - // - // In short, this means : return the highest level of the schema that does not have list - // definitions underneath it. - // - std::string s = (path_in_schema.size() > 0) ? path_in_schema[0] : ""; - for (size_t i = 1; i < path_in_schema.size(); i++) { - // The Parquet spec requires that the outer schema field is named "list". However it also - // provides a list of backwards compatibility cases that are applicable as well. Currently - // we are only handling the formal spec. This will get cleaned up and improved when we add - // support for structs. The correct thing to do will probably be to examine the type of - // the SchemaElement itself to concretely identify the start of a nested type of any kind rather - // than trying to derive it from the path string. - if (path_in_schema[i] == "list") { - // Again, strictly speaking, the Parquet spec says the inner field should be named - // "element", but there are some backwards compatibility issues that we have seen in the - // wild. For example, Pandas calls the field "item". We will allow any name for now. - i++; - continue; - } - // otherwise, we've got a real nested column. 
update the name - s = path_in_schema[i]; - } - return s; -} - -/** - * @brief Class for parsing dataset metadata - */ -struct metadata : public FileMetaData { - explicit metadata(datasource* source) - { - constexpr auto header_len = sizeof(file_header_s); - constexpr auto ender_len = sizeof(file_ender_s); - - const auto len = source->size(); - const auto header_buffer = source->host_read(0, header_len); - const auto header = reinterpret_cast(header_buffer->data()); - const auto ender_buffer = source->host_read(len - ender_len, ender_len); - const auto ender = reinterpret_cast(ender_buffer->data()); - CUDF_EXPECTS(len > header_len + ender_len, "Incorrect data source"); - CUDF_EXPECTS(header->magic == parquet_magic && ender->magic == parquet_magic, - "Corrupted header or footer"); - CUDF_EXPECTS(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len), - "Incorrect footer length"); - - const auto buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); - CompactProtocolReader cp(buffer->data(), ender->footer_len); - CUDF_EXPECTS(cp.read(this), "Cannot parse metadata"); - CUDF_EXPECTS(cp.InitSchema(this), "Cannot initialize schema"); - } -}; - -class aggregate_reader_metadata { - std::vector per_file_metadata; - std::vector> keyval_maps; - size_type num_rows; - size_type num_row_groups; - /** - * @brief Create a metadata object from each element in the source vector - */ - auto metadatas_from_sources(std::vector> const& sources) - { - std::vector metadatas; - std::transform( - sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const& source) { - return metadata(source.get()); - }); - return metadatas; - } - - /** - * @brief Collect the keyvalue maps from each per-file metadata object into a vector of maps. 
- */ - [[nodiscard]] auto collect_keyval_metadata() - { - std::vector> kv_maps; - std::transform(per_file_metadata.cbegin(), - per_file_metadata.cend(), - std::back_inserter(kv_maps), - [](auto const& pfm) { - std::unordered_map kv_map; - std::transform(pfm.key_value_metadata.cbegin(), - pfm.key_value_metadata.cend(), - std::inserter(kv_map, kv_map.end()), - [](auto const& kv) { - return std::pair{kv.key, kv.value}; - }); - return kv_map; - }); - - return kv_maps; - } - - /** - * @brief Sums up the number of rows of each source - */ - [[nodiscard]] size_type calc_num_rows() const - { - return std::accumulate( - per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { - return sum + pfm.num_rows; - }); - } - - /** - * @brief Sums up the number of row groups of each source - */ - [[nodiscard]] size_type calc_num_row_groups() const - { - return std::accumulate( - per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { - return sum + pfm.row_groups.size(); - }); - } - - public: - aggregate_reader_metadata(std::vector> const& sources) - : per_file_metadata(metadatas_from_sources(sources)), - keyval_maps(collect_keyval_metadata()), - num_rows(calc_num_rows()), - num_row_groups(calc_num_row_groups()) - { - // Verify that the input files have matching numbers of columns - size_type num_cols = -1; - for (auto const& pfm : per_file_metadata) { - if (pfm.row_groups.size() != 0) { - if (num_cols == -1) - num_cols = pfm.row_groups[0].columns.size(); - else - CUDF_EXPECTS(num_cols == static_cast(pfm.row_groups[0].columns.size()), - "All sources must have the same number of columns"); - } - } - // Verify that the input files have matching schemas - for (auto const& pfm : per_file_metadata) { - CUDF_EXPECTS(per_file_metadata[0].schema == pfm.schema, - "All sources must have the same schemas"); - } - } - - [[nodiscard]] auto const& get_row_group(size_type row_group_index, size_type src_idx) const - { - CUDF_EXPECTS(src_idx >= 0 && src_idx < static_cast(per_file_metadata.size()), - "invalid source index"); - return per_file_metadata[src_idx].row_groups[row_group_index]; - } - - [[nodiscard]] auto const& get_column_metadata(size_type row_group_index, - size_type src_idx, - int schema_idx) const - { - auto col = std::find_if( - per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(), - per_file_metadata[src_idx].row_groups[row_group_index].columns.end(), - [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx ? 
true : false; }); - CUDF_EXPECTS(col != std::end(per_file_metadata[src_idx].row_groups[row_group_index].columns), - "Found no metadata for schema index"); - return col->meta_data; - } - - [[nodiscard]] auto get_num_rows() const { return num_rows; } - - [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; } - - [[nodiscard]] auto const& get_schema(int schema_idx) const - { - return per_file_metadata[0].schema[schema_idx]; - } - - [[nodiscard]] auto const& get_key_value_metadata() const { return keyval_maps; } - - /** - * @brief Gets the concrete nesting depth of output cudf columns - * - * @param schema_index Schema index of the input column - * - * @return comma-separated index column names in quotes - */ - [[nodiscard]] inline int get_output_nesting_depth(int schema_index) const - { - auto& pfm = per_file_metadata[0]; - int depth = 0; - - // walk upwards, skipping repeated fields - while (schema_index > 0) { - if (!pfm.schema[schema_index].is_stub()) { depth++; } - // schema of one-level encoding list doesn't contain nesting information, so we need to - // manually add an extra nesting level - if (pfm.schema[schema_index].is_one_level_list()) { depth++; } - schema_index = pfm.schema[schema_index].parent_idx; - } - return depth; - } - - /** - * @brief Extracts the pandas "index_columns" section - * - * PANDAS adds its own metadata to the key_value section when writing out the - * dataframe to a file to aid in exact reconstruction. The JSON-formatted - * metadata contains the index column(s) and PANDA-specific datatypes. - * - * @return comma-separated index column names in quotes - */ - [[nodiscard]] std::string get_pandas_index() const - { - // Assumes that all input files have the same metadata - // TODO: verify this assumption - auto it = keyval_maps[0].find("pandas"); - if (it != keyval_maps[0].end()) { - // Captures a list of quoted strings found inside square brackets after `"index_columns":` - // Inside quotes supports newlines, brackets, escaped quotes, etc. - // One-liner regex: - // "index_columns"\s*:\s*\[\s*((?:"(?:|(?:.*?(?![^\\]")).?)[^\\]?",?\s*)*)\] - // Documented below. 
- std::regex index_columns_expr{ - R"("index_columns"\s*:\s*\[\s*)" // match preamble, opening square bracket, whitespace - R"(()" // Open first capturing group - R"((?:")" // Open non-capturing group match opening quote - R"((?:|(?:.*?(?![^\\]")).?))" // match empty string or anything between quotes - R"([^\\]?")" // Match closing non-escaped quote - R"(,?\s*)" // Match optional comma and whitespace - R"()*)" // Close non-capturing group and repeat 0 or more times - R"())" // Close first capturing group - R"(\])" // Match closing square brackets - }; - std::smatch sm; - if (std::regex_search(it->second, sm, index_columns_expr)) { return sm[1].str(); } - } - return ""; - } - - /** - * @brief Extracts the column name(s) used for the row indexes in a dataframe - * - * @param names List of column names to load, where index column name(s) will be added - */ - [[nodiscard]] std::vector get_pandas_index_names() const - { - std::vector names; - auto str = get_pandas_index(); - if (str.length() != 0) { - std::regex index_name_expr{R"(\"((?:\\.|[^\"])*)\")"}; - std::smatch sm; - while (std::regex_search(str, sm, index_name_expr)) { - if (sm.size() == 2) { // 2 = whole match, first item - if (std::find(names.begin(), names.end(), sm[1].str()) == names.end()) { - std::regex esc_quote{R"(\\")"}; - names.emplace_back(std::regex_replace(sm[1].str(), esc_quote, R"(")")); - } - } - str = sm.suffix(); - } - } - return names; - } - - struct row_group_info { - size_type const index; - size_t const start_row; // TODO source index - size_type const source_index; - row_group_info(size_type index, size_t start_row, size_type source_index) - : index(index), start_row(start_row), source_index(source_index) - { - } - }; - - /** - * @brief Filters and reduces down to a selection of row groups - * - * @param row_groups Lists of row groups to read, one per source - * @param row_start Starting row of the selection - * @param row_count Total number of rows selected - * - * @return List of row group indexes and its starting row - */ - [[nodiscard]] auto select_row_groups(std::vector> const& row_groups, - size_type& row_start, - size_type& row_count) const - { - if (!row_groups.empty()) { - std::vector selection; - CUDF_EXPECTS(row_groups.size() == per_file_metadata.size(), - "Must specify row groups for each source"); - - row_count = 0; - for (size_t src_idx = 0; src_idx < row_groups.size(); ++src_idx) { - for (auto const& rowgroup_idx : row_groups[src_idx]) { - CUDF_EXPECTS( - rowgroup_idx >= 0 && - rowgroup_idx < static_cast(per_file_metadata[src_idx].row_groups.size()), - "Invalid rowgroup index"); - selection.emplace_back(rowgroup_idx, row_count, src_idx); - row_count += get_row_group(rowgroup_idx, src_idx).num_rows; - } - } - return selection; - } - - row_start = std::max(row_start, 0); - if (row_count < 0) { - row_count = static_cast( - std::min(get_num_rows(), std::numeric_limits::max())); - } - row_count = min(row_count, get_num_rows() - row_start); - CUDF_EXPECTS(row_count >= 0, "Invalid row count"); - CUDF_EXPECTS(row_start <= get_num_rows(), "Invalid row start"); - - std::vector selection; - size_type count = 0; - for (size_t src_idx = 0; src_idx < per_file_metadata.size(); ++src_idx) { - for (size_t rg_idx = 0; rg_idx < per_file_metadata[src_idx].row_groups.size(); ++rg_idx) { - auto const chunk_start_row = count; - count += get_row_group(rg_idx, src_idx).num_rows; - if (count > row_start || count == 0) { - selection.emplace_back(rg_idx, chunk_start_row, src_idx); - } - if (count >= row_start + row_count) { 
break; } - } - } - - return selection; - } - - /** - * @brief Filters and reduces down to a selection of columns - * - * @param use_names List of paths of column names to select; `nullopt` if user did not select - * columns to read - * @param include_index Whether to always include the PANDAS index column(s) - * @param strings_to_categorical Type conversion parameter - * @param timestamp_type_id Type conversion parameter - * - * @return input column information, output column information, list of output column schema - * indices - */ - [[nodiscard]] auto select_columns(std::optional> const& use_names, - bool include_index, - bool strings_to_categorical, - type_id timestamp_type_id) const - { - auto find_schema_child = [&](SchemaElement const& schema_elem, std::string const& name) { - auto const& col_schema_idx = std::find_if( - schema_elem.children_idx.cbegin(), - schema_elem.children_idx.cend(), - [&](size_t col_schema_idx) { return get_schema(col_schema_idx).name == name; }); - - return (col_schema_idx != schema_elem.children_idx.end()) ? static_cast(*col_schema_idx) - : -1; - }; - - std::vector output_columns; - std::vector input_columns; - std::vector nesting; - - // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is - // valid if "struct1.child1" exists in this file's schema. If "struct1" exists but "child1" is - // not a child of "struct1" then the function will return false for "struct1" - std::function&, bool)> - build_column = [&](column_name_info const* col_name_info, - int schema_idx, - std::vector& out_col_array, - bool has_list_parent) { - if (schema_idx < 0) { return false; } - auto const& schema_elem = get_schema(schema_idx); - - // if schema_elem is a stub then it does not exist in the column_name_info and column_buffer - // hierarchy. So continue on - if (schema_elem.is_stub()) { - // is this legit? - CUDF_EXPECTS(schema_elem.num_children == 1, "Unexpected number of children for stub"); - auto child_col_name_info = (col_name_info) ? &col_name_info->children[0] : nullptr; - return build_column( - child_col_name_info, schema_elem.children_idx[0], out_col_array, has_list_parent); - } - - // if we're at the root, this is a new output column - auto const col_type = - schema_elem.is_one_level_list() - ? type_id::LIST - : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); - auto const dtype = to_data_type(col_type, schema_elem); - - column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); - if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } - // store the index of this element if inserted in out_col_array - nesting.push_back(static_cast(out_col_array.size())); - output_col.name = schema_elem.name; - - // build each child - bool path_is_valid = false; - if (col_name_info == nullptr or col_name_info->children.empty()) { - // add all children of schema_elem. 
- // At this point, we can no longer pass a col_name_info to build_column - for (int idx = 0; idx < schema_elem.num_children; idx++) { - path_is_valid |= build_column(nullptr, - schema_elem.children_idx[idx], - output_col.children, - has_list_parent || col_type == type_id::LIST); - } - } else { - for (size_t idx = 0; idx < col_name_info->children.size(); idx++) { - path_is_valid |= - build_column(&col_name_info->children[idx], - find_schema_child(schema_elem, col_name_info->children[idx].name), - output_col.children, - has_list_parent || col_type == type_id::LIST); - } - } - - // if I have no children, we're at a leaf and I'm an input column (that is, one with actual - // data stored) so add me to the list. - if (schema_elem.num_children == 0) { - input_column_info& input_col = - input_columns.emplace_back(input_column_info{schema_idx, schema_elem.name}); - - // set up child output column for one-level encoding list - if (schema_elem.is_one_level_list()) { - // determine the element data type - auto const element_type = - to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); - auto const element_dtype = to_data_type(element_type, schema_elem); - - column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); - if (has_list_parent || col_type == type_id::LIST) { - element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; - } - // store the index of this element - nesting.push_back(static_cast(output_col.children.size())); - // TODO: not sure if we should assign a name or leave it blank - element_col.name = "element"; - - output_col.children.push_back(std::move(element_col)); - } - - std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting)); - - // pop off the extra nesting element. - if (schema_elem.is_one_level_list()) { nesting.pop_back(); } - - path_is_valid = true; // If we're able to reach leaf then path is valid - } - - if (path_is_valid) { out_col_array.push_back(std::move(output_col)); } - - nesting.pop_back(); - return path_is_valid; - }; - - std::vector output_column_schemas; - - // - // there is not necessarily a 1:1 mapping between input columns and output columns. - // For example, parquet does not explicitly store a ColumnChunkDesc for struct columns. - // The "structiness" is simply implied by the schema. For example, this schema: - // required group field_id=1 name { - // required binary field_id=2 firstname (String); - // required binary field_id=3 middlename (String); - // required binary field_id=4 lastname (String); - // } - // will only contain 3 internal columns of data (firstname, middlename, lastname). But of - // course "name" is ultimately the struct column we want to return. - // - // "firstname", "middlename" and "lastname" represent the input columns in the file that we - // process to produce the final cudf "name" column. - // - // A user can ask for a single field out of the struct e.g. firstname. 
- // In this case they'll pass a fully qualified name to the schema element like - // ["name", "firstname"] - // - auto const& root = get_schema(0); - if (not use_names.has_value()) { - for (auto const& schema_idx : root.children_idx) { - build_column(nullptr, schema_idx, output_columns, false); - output_column_schemas.push_back(schema_idx); - } - } else { - struct path_info { - std::string full_path; - int schema_idx; - }; - - // Convert schema into a vector of every possible path - std::vector all_paths; - std::function add_path = [&](std::string path_till_now, - int schema_idx) { - auto const& schema_elem = get_schema(schema_idx); - std::string curr_path = path_till_now + schema_elem.name; - all_paths.push_back({curr_path, schema_idx}); - for (auto const& child_idx : schema_elem.children_idx) { - add_path(curr_path + ".", child_idx); - } - }; - for (auto const& child_idx : get_schema(0).children_idx) { - add_path("", child_idx); - } - - // Find which of the selected paths are valid and get their schema index - std::vector valid_selected_paths; - for (auto const& selected_path : *use_names) { - auto found_path = - std::find_if(all_paths.begin(), all_paths.end(), [&](path_info& valid_path) { - return valid_path.full_path == selected_path; - }); - if (found_path != all_paths.end()) { - valid_selected_paths.push_back({selected_path, found_path->schema_idx}); - } - } - - // Now construct paths as vector of strings for further consumption - std::vector> use_names3; - std::transform(valid_selected_paths.begin(), - valid_selected_paths.end(), - std::back_inserter(use_names3), - [&](path_info const& valid_path) { - auto schema_idx = valid_path.schema_idx; - std::vector result_path; - do { - SchemaElement const& elem = get_schema(schema_idx); - result_path.push_back(elem.name); - schema_idx = elem.parent_idx; - } while (schema_idx > 0); - return std::vector(result_path.rbegin(), result_path.rend()); - }); - - std::vector selected_columns; - if (include_index) { - std::vector index_names = get_pandas_index_names(); - std::transform(index_names.cbegin(), - index_names.cend(), - std::back_inserter(selected_columns), - [](std::string const& name) { return column_name_info(name); }); - } - // Merge the vector use_names into a set of hierarchical column_name_info objects - /* This is because if we have columns like this: - * col1 - * / \ - * s3 f4 - * / \ - * f5 f6 - * - * there may be common paths in use_names like: - * {"col1", "s3", "f5"}, {"col1", "f4"} - * which means we want the output to contain - * col1 - * / \ - * s3 f4 - * / - * f5 - * - * rather than - * col1 col1 - * | | - * s3 f4 - * | - * f5 - */ - for (auto const& path : use_names3) { - auto array_to_find_in = &selected_columns; - for (size_t depth = 0; depth < path.size(); ++depth) { - // Check if the path exists in our selected_columns and if not, add it. - auto const& name_to_find = path[depth]; - auto found_col = std::find_if( - array_to_find_in->begin(), - array_to_find_in->end(), - [&name_to_find](column_name_info const& col) { return col.name == name_to_find; }); - if (found_col == array_to_find_in->end()) { - auto& col = array_to_find_in->emplace_back(name_to_find); - array_to_find_in = &col.children; - } else { - // Path exists. go down further. 
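The merge of selected paths into one shared hierarchy (the `col1`/`s3`/`f4` diagram above) is essentially a trie insert. A hedged sketch, where `NameNode` and `insert_path` are hypothetical stand-ins for `column_name_info` and the merging loop:

```cpp
#include <string>
#include <vector>

// Merging {"col1","s3","f5"} and {"col1","f4"} must yield a single "col1"
// node with two children, not two disjoint trees.
struct NameNode {
  std::string name;
  std::vector<NameNode> children;
};

void insert_path(std::vector<NameNode>& roots, std::vector<std::string> const& path)
{
  auto* level = &roots;
  for (auto const& name : path) {
    NameNode* found = nullptr;
    for (auto& n : *level) {
      if (n.name == name) { found = &n; break; }
    }
    if (!found) {              // path component not present yet: add it
      level->push_back({name, {}});
      found = &level->back();
    }
    level = &found->children;  // descend, sharing any common prefix
  }
}

int main()
{
  std::vector<NameNode> selected;
  insert_path(selected, {"col1", "s3", "f5"});
  insert_path(selected, {"col1", "f4"});
  // One root ("col1") with two children ("s3" and "f4").
  return (selected.size() == 1 && selected[0].children.size() == 2) ? 0 : 1;
}
```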
- array_to_find_in = &found_col->children; - } - } - } - for (auto& col : selected_columns) { - auto const& top_level_col_schema_idx = find_schema_child(root, col.name); - bool valid_column = build_column(&col, top_level_col_schema_idx, output_columns, false); - if (valid_column) output_column_schemas.push_back(top_level_col_schema_idx); - } - } - - return std::make_tuple( - std::move(input_columns), std::move(output_columns), std::move(output_column_schemas)); - } -}; - -/** - * @brief Generate depth remappings for repetition and definition levels. - * - * When dealing with columns that contain lists, we must examine incoming - * repetition and definition level pairs to determine what range of output nesting - * is indicated when adding new values. This function generates the mappings of - * the R/D levels to those start/end bounds - * - * @param remap Maps column schema index to the R/D remapping vectors for that column - * @param src_col_schema The column schema to generate the new mapping for - * @param md File metadata information - */ -void generate_depth_remappings(std::map, std::vector>>& remap, - int src_col_schema, - aggregate_reader_metadata const& md) -{ - // already generated for this level - if (remap.find(src_col_schema) != remap.end()) { return; } - auto schema = md.get_schema(src_col_schema); - int max_depth = md.get_output_nesting_depth(src_col_schema); - - CUDF_EXPECTS(remap.find(src_col_schema) == remap.end(), - "Attempting to remap a schema more than once"); - auto inserted = - remap.insert(std::pair, std::vector>>{src_col_schema, {}}); - auto& depth_remap = inserted.first->second; - - std::vector& rep_depth_remap = (depth_remap.first); - rep_depth_remap.resize(schema.max_repetition_level + 1); - std::vector& def_depth_remap = (depth_remap.second); - def_depth_remap.resize(schema.max_definition_level + 1); - - // the key: - // for incoming level values R/D - // add values starting at the shallowest nesting level X has repetition level R - // until you reach the deepest nesting level Y that corresponds to the repetition level R1 - // held by the nesting level that has definition level D - // - // Example: a 3 level struct with a list at the bottom - // - // R / D Depth - // level0 0 / 1 0 - // level1 0 / 2 1 - // level2 0 / 3 2 - // list 0 / 3 3 - // element 1 / 4 4 - // - // incoming R/D : 0, 0 -> add values from depth 0 to 3 (def level 0 always maps to depth 0) - // incoming R/D : 0, 1 -> add values from depth 0 to 3 - // incoming R/D : 0, 2 -> add values from depth 0 to 3 - // incoming R/D : 1, 4 -> add values from depth 4 to 4 - // - // Note : the -validity- of values is simply checked by comparing the incoming D value against the - // D value of the given nesting level (incoming D >= the D for the nesting level == valid, - // otherwise NULL). The tricky part is determining what nesting levels to add values at. - // - // For schemas with no repetition level (no lists), X is always 0 and Y is always max nesting - // depth. - // - - // compute "X" from above - for (int s_idx = schema.max_repetition_level; s_idx >= 0; s_idx--) { - auto find_shallowest = [&](int r) { - int shallowest = -1; - int cur_depth = max_depth - 1; - int schema_idx = src_col_schema; - while (schema_idx > 0) { - auto cur_schema = md.get_schema(schema_idx); - if (cur_schema.max_repetition_level == r) { - // if this is a repeated field, map it one level deeper - shallowest = cur_schema.is_stub() ? 
cur_depth + 1 : cur_depth; - } - // if it's one-level encoding list - else if (cur_schema.is_one_level_list()) { - shallowest = cur_depth - 1; - } - if (!cur_schema.is_stub()) { cur_depth--; } - schema_idx = cur_schema.parent_idx; - } - return shallowest; - }; - rep_depth_remap[s_idx] = find_shallowest(s_idx); - } - - // compute "Y" from above - for (int s_idx = schema.max_definition_level; s_idx >= 0; s_idx--) { - auto find_deepest = [&](int d) { - SchemaElement prev_schema; - int schema_idx = src_col_schema; - int r1 = 0; - while (schema_idx > 0) { - SchemaElement cur_schema = md.get_schema(schema_idx); - if (cur_schema.max_definition_level == d) { - // if this is a repeated field, map it one level deeper - r1 = cur_schema.is_stub() ? prev_schema.max_repetition_level - : cur_schema.max_repetition_level; - break; - } - prev_schema = cur_schema; - schema_idx = cur_schema.parent_idx; - } - - // we now know R1 from above. return the deepest nesting level that has the - // same repetition level - schema_idx = src_col_schema; - int depth = max_depth - 1; - while (schema_idx > 0) { - SchemaElement cur_schema = md.get_schema(schema_idx); - if (cur_schema.max_repetition_level == r1) { - // if this is a repeated field, map it one level deeper - depth = cur_schema.is_stub() ? depth + 1 : depth; - break; - } - if (!cur_schema.is_stub()) { depth--; } - prev_schema = cur_schema; - schema_idx = cur_schema.parent_idx; - } - return depth; - }; - def_depth_remap[s_idx] = find_deepest(s_idx); - } -} - -/** - * @copydoc cudf::io::detail::parquet::read_column_chunks - */ -std::future reader::impl::read_column_chunks( - std::vector>& page_data, - hostdevice_vector& chunks, // TODO const? - size_t begin_chunk, - size_t end_chunk, - const std::vector& column_chunk_offsets, - std::vector const& chunk_source_map) -{ - // Transfer chunk data, coalescing adjacent chunks - std::vector> read_tasks; - for (size_t chunk = begin_chunk; chunk < end_chunk;) { - const size_t io_offset = column_chunk_offsets[chunk]; - size_t io_size = chunks[chunk].compressed_size; - size_t next_chunk = chunk + 1; - const bool is_compressed = (chunks[chunk].codec != parquet::Compression::UNCOMPRESSED); - while (next_chunk < end_chunk) { - const size_t next_offset = column_chunk_offsets[next_chunk]; - const bool is_next_compressed = - (chunks[next_chunk].codec != parquet::Compression::UNCOMPRESSED); - if (next_offset != io_offset + io_size || is_next_compressed != is_compressed) { - // Can't merge if not contiguous or mixing compressed and uncompressed - // Not coalescing uncompressed with compressed chunks is so that compressed buffers can be - // freed earlier (immediately after decompression stage) to limit peak memory requirements - break; - } - io_size += chunks[next_chunk].compressed_size; - next_chunk++; - } - if (io_size != 0) { - auto& source = _sources[chunk_source_map[chunk]]; - if (source->is_device_read_preferred(io_size)) { - auto buffer = rmm::device_buffer(io_size, _stream); - auto fut_read_size = source->device_read_async( - io_offset, io_size, static_cast(buffer.data()), _stream); - read_tasks.emplace_back(std::move(fut_read_size)); - page_data[chunk] = datasource::buffer::create(std::move(buffer)); - } else { - auto const buffer = source->host_read(io_offset, io_size); - page_data[chunk] = - datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), _stream)); - } - auto d_compdata = page_data[chunk]->data(); - do { - chunks[chunk].compressed_data = d_compdata; - d_compdata += 
chunks[chunk].compressed_size; - } while (++chunk != next_chunk); - } else { - chunk = next_chunk; - } - } - auto sync_fn = [](decltype(read_tasks) read_tasks) { - for (auto& task : read_tasks) { - task.wait(); - } - }; - return std::async(std::launch::deferred, sync_fn, std::move(read_tasks)); -} - -/** - * @copydoc cudf::io::detail::parquet::count_page_headers - */ -size_t reader::impl::count_page_headers(hostdevice_vector& chunks) -{ - size_t total_pages = 0; - - chunks.host_to_device(_stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), _stream); - chunks.device_to_host(_stream, true); - - for (size_t c = 0; c < chunks.size(); c++) { - total_pages += chunks[c].num_data_pages + chunks[c].num_dict_pages; - } - - return total_pages; -} - -/** - * @copydoc cudf::io::detail::parquet::decode_page_headers - */ -void reader::impl::decode_page_headers(hostdevice_vector& chunks, - hostdevice_vector& pages) -{ - // IMPORTANT : if you change how pages are stored within a chunk (dist pages, then data pages), - // please update preprocess_nested_columns to reflect this. - for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { - chunks[c].max_num_pages = chunks[c].num_data_pages + chunks[c].num_dict_pages; - chunks[c].page_info = pages.device_ptr(page_count); - page_count += chunks[c].max_num_pages; - } - - chunks.host_to_device(_stream); - gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), _stream); - pages.device_to_host(_stream, true); -} - -/** - * @copydoc cudf::io::detail::parquet::decompress_page_data - */ -rmm::device_buffer reader::impl::decompress_page_data( - hostdevice_vector& chunks, hostdevice_vector& pages) -{ - auto for_each_codec_page = [&](parquet::Compression codec, const std::function& f) { - for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { - const auto page_stride = chunks[c].max_num_pages; - if (chunks[c].codec == codec) { - for (int k = 0; k < page_stride; k++) { - f(page_count + k); - } - } - page_count += page_stride; - } - }; - - // Brotli scratch memory for decompressing - rmm::device_buffer debrotli_scratch; - - // Count the exact number of compressed pages - size_t num_comp_pages = 0; - size_t total_decomp_size = 0; - - struct codec_stats { - parquet::Compression compression_type = UNCOMPRESSED; - size_t num_pages = 0; - int32_t max_decompressed_size = 0; - size_t total_decomp_size = 0; - }; - - std::array codecs{codec_stats{parquet::GZIP}, - codec_stats{parquet::SNAPPY}, - codec_stats{parquet::BROTLI}, - codec_stats{parquet::ZSTD}}; - - auto is_codec_supported = [&codecs](int8_t codec) { - if (codec == parquet::UNCOMPRESSED) return true; - return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) { - return codec == cstats.compression_type; - }) != codecs.end(); - }; - CUDF_EXPECTS(std::all_of(chunks.begin(), - chunks.end(), - [&is_codec_supported](auto const& chunk) { - return is_codec_supported(chunk.codec); - }), - "Unsupported compression type"); - - for (auto& codec : codecs) { - for_each_codec_page(codec.compression_type, [&](size_t page) { - auto page_uncomp_size = pages[page].uncompressed_page_size; - total_decomp_size += page_uncomp_size; - codec.total_decomp_size += page_uncomp_size; - codec.max_decompressed_size = std::max(codec.max_decompressed_size, page_uncomp_size); - codec.num_pages++; - num_comp_pages++; - }); - if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) { - debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), _stream); - } - } - - // Dispatch batches 
of pages to decompress for each codec - rmm::device_buffer decomp_pages(total_decomp_size, _stream); - - std::vector> comp_in; - comp_in.reserve(num_comp_pages); - std::vector> comp_out; - comp_out.reserve(num_comp_pages); - - rmm::device_uvector comp_res(num_comp_pages, _stream); - thrust::fill(rmm::exec_policy(_stream), - comp_res.begin(), - comp_res.end(), - compression_result{0, compression_status::FAILURE}); - - size_t decomp_offset = 0; - int32_t start_pos = 0; - for (const auto& codec : codecs) { - if (codec.num_pages == 0) { continue; } - - for_each_codec_page(codec.compression_type, [&](size_t page) { - auto dst_base = static_cast(decomp_pages.data()); - comp_in.emplace_back(pages[page].page_data, - static_cast(pages[page].compressed_page_size)); - comp_out.emplace_back(dst_base + decomp_offset, - static_cast(pages[page].uncompressed_page_size)); - - pages[page].page_data = static_cast(comp_out.back().data()); - decomp_offset += comp_out.back().size(); - }); - - host_span const> comp_in_view{comp_in.data() + start_pos, - codec.num_pages}; - auto const d_comp_in = cudf::detail::make_device_uvector_async(comp_in_view, _stream); - host_span const> comp_out_view(comp_out.data() + start_pos, - codec.num_pages); - auto const d_comp_out = cudf::detail::make_device_uvector_async(comp_out_view, _stream); - device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); - - switch (codec.compression_type) { - case parquet::GZIP: - gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, _stream); - break; - case parquet::SNAPPY: - if (nvcomp_integration::is_stable_enabled()) { - nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, - d_comp_in, - d_comp_out, - d_comp_res_view, - codec.max_decompressed_size, - codec.total_decomp_size, - _stream); - } else { - gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, _stream); - } - break; - case parquet::ZSTD: - nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, - d_comp_in, - d_comp_out, - d_comp_res_view, - codec.max_decompressed_size, - codec.total_decomp_size, - _stream); - break; - case parquet::BROTLI: - gpu_debrotli(d_comp_in, - d_comp_out, - d_comp_res_view, - debrotli_scratch.data(), - debrotli_scratch.size(), - _stream); - break; - default: CUDF_FAIL("Unexpected decompression dispatch"); break; - } - start_pos += codec.num_pages; - } - - decompress_check(comp_res, _stream); - - // Update the page information in device memory with the updated value of - // page_data; it now points to the uncompressed data buffer - pages.host_to_device(_stream); - - return decomp_pages; -} - -/** - * @copydoc cudf::io::detail::parquet::allocate_nesting_info - */ -void reader::impl::allocate_nesting_info(hostdevice_vector const& chunks, - hostdevice_vector& pages, - hostdevice_vector& page_nesting_info) -{ - // compute total # of page_nesting infos needed and allocate space. 
doing this in one - // buffer to keep it to a single gpu allocation - size_t const total_page_nesting_infos = std::accumulate( - chunks.host_ptr(), chunks.host_ptr() + chunks.size(), 0, [&](int total, auto& chunk) { - // the schema of the input column - auto const& schema = _metadata->get_schema(chunk.src_col_schema); - auto const per_page_nesting_info_size = max( - schema.max_definition_level + 1, _metadata->get_output_nesting_depth(chunk.src_col_schema)); - return total + (per_page_nesting_info_size * chunk.num_data_pages); - }); - - page_nesting_info = hostdevice_vector{total_page_nesting_infos, _stream}; - - // retrieve from the gpu so we can update - pages.device_to_host(_stream, true); - - // update pointers in the PageInfos - int target_page_index = 0; - int src_info_index = 0; - for (size_t idx = 0; idx < chunks.size(); idx++) { - int src_col_schema = chunks[idx].src_col_schema; - auto& schema = _metadata->get_schema(src_col_schema); - auto const per_page_nesting_info_size = std::max( - schema.max_definition_level + 1, _metadata->get_output_nesting_depth(src_col_schema)); - - // skip my dict pages - target_page_index += chunks[idx].num_dict_pages; - for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - pages[target_page_index + p_idx].nesting = page_nesting_info.device_ptr() + src_info_index; - pages[target_page_index + p_idx].num_nesting_levels = per_page_nesting_info_size; - - src_info_index += per_page_nesting_info_size; - } - target_page_index += chunks[idx].num_data_pages; - } - - // copy back to the gpu - pages.host_to_device(_stream); - - // fill in - int nesting_info_index = 0; - std::map, std::vector>> depth_remapping; - for (size_t idx = 0; idx < chunks.size(); idx++) { - int src_col_schema = chunks[idx].src_col_schema; - - // schema of the input column - auto& schema = _metadata->get_schema(src_col_schema); - // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc) - int max_depth = _metadata->get_output_nesting_depth(src_col_schema); - - // # of nesting infos stored per page for this column - auto const per_page_nesting_info_size = std::max(schema.max_definition_level + 1, max_depth); - - // if this column has lists, generate depth remapping - std::map, std::vector>> depth_remapping; - if (schema.max_repetition_level > 0) { - generate_depth_remappings(depth_remapping, src_col_schema, *_metadata); - } - - // fill in host-side nesting info - int schema_idx = src_col_schema; - auto cur_schema = _metadata->get_schema(schema_idx); - int cur_depth = max_depth - 1; - while (schema_idx > 0) { - // stub columns (basically the inner field of a list scheme element) are not real columns. 
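The sizing rule used here, `per_page_nesting_info_size = max(max_definition_level + 1, output nesting depth)`, and the carving of one contiguous allocation into per-page slices can be sketched independently of the GPU types; `ChunkDesc` below is a hypothetical stand-in for `gpu::ColumnChunkDesc`:

```cpp
#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical stand-in holding only the fields the sizing rule needs.
struct ChunkDesc {
  int max_def_level;
  int nesting_depth;
  int num_data_pages;
};

int main()
{
  std::vector<ChunkDesc> chunks{{2, 1, 4}, {3, 5, 2}};

  std::size_t total = 0;
  std::vector<std::size_t> page_offsets;  // slice start for every data page
  for (auto const& c : chunks) {
    std::size_t const per_page = std::max(c.max_def_level + 1, c.nesting_depth);
    for (int p = 0; p < c.num_data_pages; ++p) {
      page_offsets.push_back(total);  // each page indexes into one big buffer
      total += per_page;
    }
  }
  // `total` is the size of the single allocation: 4*3 + 2*5 entries here.
  return (total == 4 * 3 + 2 * 5) ? 0 : 1;
}
```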
- // we can ignore them for the purposes of output nesting info - if (!cur_schema.is_stub()) { - // initialize each page within the chunk - for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { - gpu::PageNestingInfo* pni = - &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; - - // if we have lists, set our start and end depth remappings - if (schema.max_repetition_level > 0) { - auto remap = depth_remapping.find(src_col_schema); - CUDF_EXPECTS(remap != depth_remapping.end(), - "Could not find depth remapping for schema"); - std::vector const& rep_depth_remap = (remap->second.first); - std::vector const& def_depth_remap = (remap->second.second); - - for (size_t m = 0; m < rep_depth_remap.size(); m++) { - pni[m].start_depth = rep_depth_remap[m]; - } - for (size_t m = 0; m < def_depth_remap.size(); m++) { - pni[m].end_depth = def_depth_remap[m]; - } - } - - // values indexed by output column index - pni[cur_depth].max_def_level = cur_schema.max_definition_level; - pni[cur_depth].max_rep_level = cur_schema.max_repetition_level; - pni[cur_depth].size = 0; - } - - // move up the hierarchy - cur_depth--; - } - - // next schema - schema_idx = cur_schema.parent_idx; - cur_schema = _metadata->get_schema(schema_idx); - } - - nesting_info_index += (per_page_nesting_info_size * chunks[idx].num_data_pages); - } - - // copy nesting info to the device - page_nesting_info.host_to_device(_stream); -} - -/** - * @copydoc cudf::io::detail::parquet::preprocess_columns - */ -void reader::impl::preprocess_columns(hostdevice_vector& chunks, - hostdevice_vector& pages, - size_t min_row, - size_t total_rows, - bool uses_custom_row_bounds, - bool has_lists) -{ - // TODO : we should be selectively preprocessing only columns that have - // lists in them instead of doing them all if even one contains lists. 
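The no-lists fast path referenced by this TODO is a plain recursive walk that sizes every buffer to the row count. A minimal sketch, with a hypothetical `Buf` in place of `column_buffer`:

```cpp
#include <cstddef>
#include <vector>

// When a schema has no repeated fields, every output buffer at every nesting
// level is simply sized to the row count, so allocation is a recursive walk.
struct Buf {
  std::size_t size = 0;
  std::vector<Buf> children;
};

void create_all(std::vector<Buf>& cols, std::size_t num_rows)
{
  for (auto& col : cols) {
    col.size = num_rows;  // the real code calls col.create(total_rows, stream, mr)
    create_all(col.children, num_rows);
  }
}

int main()
{
  std::vector<Buf> out(1);
  out[0].children.resize(3);  // e.g. a struct with three string children
  create_all(out, 1000);
  return (out[0].children[2].size == 1000) ? 0 : 1;
}
```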
- - // if there are no lists, simply allocate every allocate every output - // column to be of size num_rows - if (!has_lists) { - std::function&)> create_columns = - [&](std::vector& cols) { - for (size_t idx = 0; idx < cols.size(); idx++) { - auto& col = cols[idx]; - col.create(total_rows, _stream, _mr); - create_columns(col.children); - } - }; - create_columns(_output_columns); - } else { - // preprocess per-nesting level sizes by page - gpu::PreprocessColumnData(pages, - chunks, - _input_columns, - _output_columns, - total_rows, - min_row, - uses_custom_row_bounds, - _stream, - _mr); - _stream.synchronize(); - } -} - -/** - * @copydoc cudf::io::detail::parquet::decode_page_data - */ -void reader::impl::decode_page_data(hostdevice_vector& chunks, - hostdevice_vector& pages, - hostdevice_vector& page_nesting, - size_t min_row, - size_t total_rows) -{ - auto is_dict_chunk = [](const gpu::ColumnChunkDesc& chunk) { - return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0; - }; - - // Count the number of string dictionary entries - // NOTE: Assumes first page in the chunk is always the dictionary page - size_t total_str_dict_indexes = 0; - for (size_t c = 0, page_count = 0; c < chunks.size(); c++) { - if (is_dict_chunk(chunks[c])) { total_str_dict_indexes += pages[page_count].num_input_values; } - page_count += chunks[c].max_num_pages; - } - - // Build index for string dictionaries since they can't be indexed - // directly due to variable-sized elements - auto str_dict_index = cudf::detail::make_zeroed_device_uvector_async( - total_str_dict_indexes, _stream); - - // TODO (dm): hd_vec should have begin and end iterator members - size_t sum_max_depths = - std::accumulate(chunks.host_ptr(), - chunks.host_ptr(chunks.size()), - 0, - [&](size_t cursum, gpu::ColumnChunkDesc const& chunk) { - return cursum + _metadata->get_output_nesting_depth(chunk.src_col_schema); - }); - - // In order to reduce the number of allocations of hostdevice_vector, we allocate a single vector - // to store all per-chunk pointers to nested data/nullmask. `chunk_offsets[i]` will store the - // offset into `chunk_nested_data`/`chunk_nested_valids` for the array of pointers for chunk `i` - auto chunk_nested_valids = hostdevice_vector(sum_max_depths, _stream); - auto chunk_nested_data = hostdevice_vector(sum_max_depths, _stream); - auto chunk_offsets = std::vector(); - - // Update chunks with pointers to column data. - for (size_t c = 0, page_count = 0, str_ofs = 0, chunk_off = 0; c < chunks.size(); c++) { - input_column_info const& input_col = _input_columns[chunks[c].src_col_index]; - CUDF_EXPECTS(input_col.schema_idx == chunks[c].src_col_schema, - "Column/page schema index mismatch"); - - if (is_dict_chunk(chunks[c])) { - chunks[c].str_dict_index = str_dict_index.data() + str_ofs; - str_ofs += pages[page_count].num_input_values; - } - - size_t max_depth = _metadata->get_output_nesting_depth(chunks[c].src_col_schema); - chunk_offsets.push_back(chunk_off); - - // get a slice of size `nesting depth` from `chunk_nested_valids` to store an array of pointers - // to validity data - auto valids = chunk_nested_valids.host_ptr(chunk_off); - chunks[c].valid_map_base = chunk_nested_valids.device_ptr(chunk_off); - - // get a slice of size `nesting depth` from `chunk_nested_data` to store an array of pointers to - // out data - auto data = chunk_nested_data.host_ptr(chunk_off); - chunks[c].column_data_base = chunk_nested_data.device_ptr(chunk_off); - - chunk_off += max_depth; - - // fill in the arrays on the host. 
there are some important considerations to - // take into account here for nested columns. specifically, with structs - // there is sharing of output buffers between input columns. consider this schema - // - // required group field_id=1 name { - // required binary field_id=2 firstname (String); - // required binary field_id=3 middlename (String); - // required binary field_id=4 lastname (String); - // } - // - // there are 3 input columns of data here (firstname, middlename, lastname), but - // only 1 output column (name). The structure of the output column buffers looks like - // the schema itself - // - // struct (name) - // string (firstname) - // string (middlename) - // string (lastname) - // - // The struct column can contain validity information. the problem is, the decode - // step for the input columns will all attempt to decode this validity information - // because each one has it's own copy of the repetition/definition levels. but - // since this is all happening in parallel it would mean multiple blocks would - // be stomping all over the same memory randomly. to work around this, we set - // things up so that only 1 child of any given nesting level fills in the - // data (offsets in the case of lists) or validity information for the higher - // levels of the hierarchy that are shared. In this case, it would mean we - // would just choose firstname to be the one that decodes the validity for name. - // - // we do this by only handing out the pointers to the first child we come across. - // - auto* cols = &_output_columns; - for (size_t idx = 0; idx < max_depth; idx++) { - auto& out_buf = (*cols)[input_col.nesting[idx]]; - cols = &out_buf.children; - - int owning_schema = out_buf.user_data & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; - if (owning_schema == 0 || owning_schema == input_col.schema_idx) { - valids[idx] = out_buf.null_mask(); - data[idx] = out_buf.data(); - out_buf.user_data |= - static_cast(input_col.schema_idx) & PARQUET_COLUMN_BUFFER_SCHEMA_MASK; - } else { - valids[idx] = nullptr; - data[idx] = nullptr; - } - } - - // column_data_base will always point to leaf data, even for nested types. - page_count += chunks[c].max_num_pages; - } - - chunks.host_to_device(_stream); - chunk_nested_valids.host_to_device(_stream); - chunk_nested_data.host_to_device(_stream); - - if (total_str_dict_indexes > 0) { - gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream); - } - - gpu::DecodePageData(pages, chunks, total_rows, min_row, _stream); - pages.device_to_host(_stream); - page_nesting.device_to_host(_stream); - _stream.synchronize(); - - // for list columns, add the final offset to every offset buffer. - // TODO : make this happen in more efficiently. Maybe use thrust::for_each - // on each buffer. Or potentially do it in PreprocessColumnData - // Note : the reason we are doing this here instead of in the decode kernel is - // that it is difficult/impossible for a given page to know that it is writing the very - // last value that should then be followed by a terminator (because rows can span - // page boundaries). 
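A toy illustration of that trailing-offset fix-up, using plain host vectors instead of device buffers:

```cpp
#include <vector>

// Why a list column's offsets buffer needs one trailing entry: for a list at
// level N, the final offset equals the size of its child, which no single
// page can know while decoding (rows can span page boundaries).
int main()
{
  // Offsets for 3 lists: [0,2), [2,3), [3,?). Decode fills all but the last.
  std::vector<int> offsets{0, 2, 3, 0};
  int const child_size = 5;      // total elements in the child column

  offsets.back() = child_size;   // the "terminator" written after decode
  // Row i spans child elements [offsets[i], offsets[i+1]).
  return (offsets[3] - offsets[2] == 2) ? 0 : 1;
}
```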
- for (size_t idx = 0; idx < _input_columns.size(); idx++) { - input_column_info const& input_col = _input_columns[idx]; - - auto* cols = &_output_columns; - for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { - auto& out_buf = (*cols)[input_col.nesting[l_idx]]; - cols = &out_buf.children; - - if (out_buf.type.id() != type_id::LIST || - (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED)) { - continue; - } - CUDF_EXPECTS(l_idx < input_col.nesting_depth() - 1, "Encountered a leaf list column"); - auto& child = (*cols)[input_col.nesting[l_idx + 1]]; - - // the final offset for a list at level N is the size of it's child - int offset = child.type.id() == type_id::LIST ? child.size - 1 : child.size; - cudaMemcpyAsync(static_cast(out_buf.data()) + (out_buf.size - 1), - &offset, - sizeof(offset), - cudaMemcpyHostToDevice, - _stream.value()); - out_buf.user_data |= PARQUET_COLUMN_BUFFER_FLAG_LIST_TERMINATED; - } - } - - // update null counts in the final column buffers - for (size_t idx = 0; idx < pages.size(); idx++) { - gpu::PageInfo* pi = &pages[idx]; - if (pi->flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } - gpu::ColumnChunkDesc* col = &chunks[pi->chunk_idx]; - input_column_info const& input_col = _input_columns[col->src_col_index]; - - int index = pi->nesting - page_nesting.device_ptr(); - gpu::PageNestingInfo* pni = &page_nesting[index]; - - auto* cols = &_output_columns; - for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) { - auto& out_buf = (*cols)[input_col.nesting[l_idx]]; - cols = &out_buf.children; - - // if I wasn't the one who wrote out the validity bits, skip it - if (chunk_nested_valids.host_ptr(chunk_offsets[pi->chunk_idx])[l_idx] == nullptr) { - continue; - } - out_buf.null_count() += pni[l_idx].null_count; - } - } - - _stream.synchronize(); -} - -reader::impl::impl(std::vector>&& sources, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _stream(stream), _mr(mr), _sources(std::move(sources)) -{ - // Open and parse the source dataset metadata - _metadata = std::make_unique(_sources); - - // Override output timestamp resolution if requested - if (options.get_timestamp_type().id() != type_id::EMPTY) { - _timestamp_type = options.get_timestamp_type(); - } - - // Strings may be returned as either string or categorical columns - _strings_to_categorical = options.is_enabled_convert_strings_to_categories(); - - // Binary columns can be read as binary or strings - _reader_column_schema = options.get_column_schema(); - - // Select only columns required by the options - std::tie(_input_columns, _output_columns, _output_column_schemas) = - _metadata->select_columns(options.get_columns(), - options.is_enabled_use_pandas_metadata(), - _strings_to_categorical, - _timestamp_type.id()); -} - -table_with_metadata reader::impl::read(size_type skip_rows, - size_type num_rows, - bool uses_custom_row_bounds, - std::vector> const& row_group_list) -{ - // Select only row groups required - const auto selected_row_groups = - _metadata->select_row_groups(row_group_list, skip_rows, num_rows); - - table_metadata out_metadata; - - // output cudf columns as determined by the top level schema - std::vector> out_columns; - out_columns.reserve(_output_columns.size()); - - if (selected_row_groups.size() != 0 && _input_columns.size() != 0) { - // Descriptors for all the chunks that make up the selected columns - const auto num_input_columns = _input_columns.size(); - const auto num_chunks = 
selected_row_groups.size() * num_input_columns; - hostdevice_vector chunks(0, num_chunks, _stream); - - // Association between each column chunk and its source - std::vector chunk_source_map(num_chunks); - - // Tracker for eventually deallocating compressed and uncompressed data - std::vector> page_data(num_chunks); - - // Keep track of column chunk file offsets - std::vector column_chunk_offsets(num_chunks); - - // if there are lists present, we need to preprocess - bool has_lists = false; - - // Initialize column chunk information - size_t total_decompressed_size = 0; - auto remaining_rows = num_rows; - std::vector> read_rowgroup_tasks; - for (const auto& rg : selected_row_groups) { - const auto& row_group = _metadata->get_row_group(rg.index, rg.source_index); - auto const row_group_start = rg.start_row; - auto const row_group_source = rg.source_index; - auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); - auto const io_chunk_idx = chunks.size(); - - // generate ColumnChunkDesc objects for everything to be decoded (all input columns) - for (size_t i = 0; i < num_input_columns; ++i) { - auto col = _input_columns[i]; - // look up metadata - auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); - auto& schema = _metadata->get_schema(col.schema_idx); - - // this column contains repetition levels and will require a preprocess - if (schema.max_repetition_level > 0) { has_lists = true; } - - auto [type_width, clock_rate, converted_type] = - conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), - _timestamp_type.id(), - schema.type, - schema.converted_type, - schema.type_length); - - column_chunk_offsets[chunks.size()] = - (col_meta.dictionary_page_offset != 0) - ? std::min(col_meta.data_page_offset, col_meta.dictionary_page_offset) - : col_meta.data_page_offset; - - chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, - nullptr, - col_meta.num_values, - schema.type, - type_width, - row_group_start, - row_group_rows, - schema.max_definition_level, - schema.max_repetition_level, - _metadata->get_output_nesting_depth(col.schema_idx), - required_bits(schema.max_definition_level), - required_bits(schema.max_repetition_level), - col_meta.codec, - converted_type, - schema.logical_type, - schema.decimal_scale, - clock_rate, - i, - col.schema_idx)); - - // Map each column chunk to its column index and its source index - chunk_source_map[chunks.size() - 1] = row_group_source; - - if (col_meta.codec != Compression::UNCOMPRESSED) { - total_decompressed_size += col_meta.total_uncompressed_size; - } - } - // Read compressed chunk data to device memory - read_rowgroup_tasks.push_back(read_column_chunks( - page_data, chunks, io_chunk_idx, chunks.size(), column_chunk_offsets, chunk_source_map)); - - remaining_rows -= row_group.num_rows; - } - for (auto& task : read_rowgroup_tasks) { - task.wait(); - } - assert(remaining_rows <= 0); - - // Process dataset chunk pages into output columns - const auto total_pages = count_page_headers(chunks); - if (total_pages > 0) { - hostdevice_vector pages(total_pages, total_pages, _stream); - rmm::device_buffer decomp_page_data; - - // decoding of column/page information - decode_page_headers(chunks, pages); - if (total_decompressed_size > 0) { - decomp_page_data = decompress_page_data(chunks, pages); - // Free compressed data - for (size_t c = 0; c < chunks.size(); c++) { - if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { page_data[c].reset(); } - } - } - - 
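The chunk reads issued above are coalesced whenever their file ranges are adjacent. A self-contained sketch of that merging rule; `Range` is hypothetical, and the real code additionally refuses to merge compressed with uncompressed chunks so compressed buffers can be freed early:

```cpp
#include <cstddef>
#include <vector>

// A byte range within the source file.
struct Range {
  std::size_t offset;
  std::size_t size;
};

// Merge touching ranges into fewer, larger reads.
std::vector<Range> coalesce(std::vector<Range> const& chunks)
{
  std::vector<Range> reads;
  for (auto const& c : chunks) {
    if (!reads.empty() && reads.back().offset + reads.back().size == c.offset) {
      reads.back().size += c.size;  // contiguous: extend the previous read
    } else {
      reads.push_back(c);           // gap: start a new read
    }
  }
  return reads;
}

int main()
{
  std::vector<Range> chunks{{0, 100}, {100, 50}, {400, 25}};
  auto const reads = coalesce(chunks);  // -> {0,150} and {400,25}
  return (reads.size() == 2) ? 0 : 1;
}
```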
// build output column info - // walk the schema, building out_buffers that mirror what our final cudf columns will look - // like. important : there is not necessarily a 1:1 mapping between input columns and output - // columns. For example, parquet does not explicitly store a ColumnChunkDesc for struct - // columns. The "structiness" is simply implied by the schema. For example, this schema: - // required group field_id=1 name { - // required binary field_id=2 firstname (String); - // required binary field_id=3 middlename (String); - // required binary field_id=4 lastname (String); - // } - // will only contain 3 columns of data (firstname, middlename, lastname). But of course - // "name" is a struct column that we want to return, so we have to make sure that we - // create it ourselves. - // std::vector output_info = build_output_column_info(); - - // nesting information (sizes, etc) stored -per page- - // note : even for flat schemas, we allocate 1 level of "nesting" info - hostdevice_vector page_nesting_info; - allocate_nesting_info(chunks, pages, page_nesting_info); - - // - compute column sizes and allocate output buffers. - // important: - // for nested schemas, we have to do some further preprocessing to determine: - // - real column output sizes per level of nesting (in a flat schema, there's only 1 level - // of - // nesting and it's size is the row count) - // - // - for nested schemas, output buffer offset values per-page, per nesting-level for the - // purposes of decoding. - preprocess_columns(chunks, pages, skip_rows, num_rows, uses_custom_row_bounds, has_lists); - - // decoding of column data itself - decode_page_data(chunks, pages, page_nesting_info, skip_rows, num_rows); - - // create the final output cudf columns - for (size_t i = 0; i < _output_columns.size(); ++i) { - column_name_info& col_name = out_metadata.schema_info.emplace_back(""); - auto const metadata = - _reader_column_schema.has_value() - ? std::make_optional((*_reader_column_schema)[i]) - : std::nullopt; - out_columns.emplace_back( - make_column(_output_columns[i], &col_name, metadata, _stream, _mr)); - } - } - } - - // Create empty columns as needed (this can happen if we've ended up with no actual data to read) - for (size_t i = out_columns.size(); i < _output_columns.size(); ++i) { - column_name_info& col_name = out_metadata.schema_info.emplace_back(""); - out_columns.emplace_back(io::detail::empty_like(_output_columns[i], &col_name, _stream, _mr)); - } - - // Return column names (must match order of returned columns) - out_metadata.column_names.resize(_output_columns.size()); - for (size_t i = 0; i < _output_column_schemas.size(); i++) { - auto const& schema = _metadata->get_schema(_output_column_schemas[i]); - out_metadata.column_names[i] = schema.name; - } - - // Return user metadata - out_metadata.per_file_user_data = _metadata->get_key_value_metadata(); - out_metadata.user_data = {out_metadata.per_file_user_data[0].begin(), - out_metadata.per_file_user_data[0].end()}; - - return {std::make_unique
(std::move(out_columns)), std::move(out_metadata)}; -} - -// Forward to implementation -reader::reader(std::vector>&& sources, - parquet_reader_options const& options, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) - : _impl(std::make_unique(std::move(sources), options, stream, mr)) -{ -} - -// Destructor within this translation unit -reader::~reader() = default; - -// Forward to implementation -table_with_metadata reader::read(parquet_reader_options const& options) -{ - // if the user has specified custom row bounds - bool const uses_custom_row_bounds = options.get_num_rows() >= 0 || options.get_skip_rows() != 0; - return _impl->read(options.get_skip_rows(), - options.get_num_rows(), - uses_custom_row_bounds, - options.get_row_groups()); -} - -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf diff --git a/cpp/src/io/parquet/reader_impl.hpp b/cpp/src/io/parquet/reader_impl.hpp index e1f275bb8e8..6d42e9fab84 100644 --- a/cpp/src/io/parquet/reader_impl.hpp +++ b/cpp/src/io/parquet/reader_impl.hpp @@ -21,33 +21,23 @@ #pragma once -#include "parquet.hpp" #include "parquet_gpu.hpp" +#include "reader_impl_helpers.hpp" #include -#include #include #include #include #include +#include #include -#include -#include +#include #include -namespace cudf { -namespace io { -namespace detail { -namespace parquet { -using namespace cudf::io::parquet; -using namespace cudf::io; - -// Forward declarations -class aggregate_reader_metadata; - +namespace cudf::io::detail::parquet { /** * @brief Implementation for Parquet reader */ @@ -56,6 +46,9 @@ class reader::impl { /** * @brief Constructor from an array of dataset sources with reader options. * + * By using this constructor, each call to `read()` or `read_chunk()` will perform reading the + * entire given file. + * * @param sources Dataset sources * @param options Settings for controlling reading behavior * @param stream CUDA stream used for device memory operations and kernel launches @@ -71,8 +64,8 @@ class reader::impl { * * @param skip_rows Number of rows to skip from the start * @param num_rows Number of rows to read - * @param uses_custom_row_bounds Whether or not num_rows and min_rows represents user-specific - * bounds + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds * @param row_group_indices Lists of row groups to read, one per source * * @return The set of columns along with metadata @@ -80,113 +73,142 @@ class reader::impl { table_with_metadata read(size_type skip_rows, size_type num_rows, bool uses_custom_row_bounds, - std::vector> const& row_group_indices); + host_span const> row_group_indices); - private: /** - * @brief Reads compressed page data to device memory + * @brief Constructor from a chunk read limit and an array of dataset sources with reader options. + * + * By using this constructor, the reader will support iterative (chunked) reading through + * `has_next() ` and `read_chunk()`. 
For example: + * ``` + * do { + * auto const chunk = reader.read_chunk(); + * // Process chunk + * } while (reader.has_next()); + * + * ``` * - * @param page_data Buffers to hold compressed page data for each chunk - * @param chunks List of column chunk descriptors - * @param begin_chunk Index of first column chunk to read - * @param end_chunk Index after the last column chunk to read - * @param column_chunk_offsets File offset for all chunks + * Reading the whole given file at once through `read()` function is still supported if + * `chunk_read_limit == 0` (i.e., no reading limit). + * In such case, `read_chunk()` will also return rows of the entire file. * + * @param chunk_read_limit Limit on total number of bytes to be returned per read, + * or `0` if there is no limit + * @param sources Dataset sources + * @param options Settings for controlling reading behavior + * @param stream CUDA stream used for device memory operations and kernel launches + * @param mr Device memory resource to use for device memory allocation + */ + explicit impl(std::size_t chunk_read_limit, + std::vector>&& sources, + parquet_reader_options const& options, + rmm::cuda_stream_view stream, + rmm::mr::device_memory_resource* mr); + + /** + * @copydoc cudf::io::chunked_parquet_reader::has_next */ - std::future read_column_chunks(std::vector>& page_data, - hostdevice_vector& chunks, - size_t begin_chunk, - size_t end_chunk, - const std::vector& column_chunk_offsets, - std::vector const& chunk_source_map); + bool has_next(); /** - * @brief Returns the number of total pages from the given column chunks - * - * @param chunks List of column chunk descriptors - * - * @return The total number of pages + * @copydoc cudf::io::chunked_parquet_reader::read_chunk */ - size_t count_page_headers(hostdevice_vector& chunks); + table_with_metadata read_chunk(); + private: /** - * @brief Returns the page information from the given column chunks. + * @brief Perform the necessary data preprocessing for parsing file later on. * - * @param chunks List of column chunk descriptors - * @param pages List of page information + * @param skip_rows Number of rows to skip from the start + * @param num_rows Number of rows to read, or `-1` to read all rows + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds + * @param row_group_indices Lists of row groups to read (one per source), or empty if read all */ - void decode_page_headers(hostdevice_vector& chunks, - hostdevice_vector& pages); + void prepare_data(size_type skip_rows, + size_type num_rows, + bool uses_custom_row_bounds, + host_span const> row_group_indices); /** - * @brief Decompresses the page data, at page granularity. + * @brief Load and decompress the input file(s) into memory. + */ + void load_and_decompress_data(std::vector const& row_groups_info, + size_type num_rows); + + /** + * @brief Perform some preprocessing for page data and also compute the split locations + * {skip_rows, num_rows} for chunked reading. + * + * There are several pieces of information we can't compute directly from row counts in + * the parquet headers when dealing with nested schemas: + * - The total sizes of all output columns at all nesting levels + * - The starting output buffer offset for each page, for each nesting level * - * @param chunks List of column chunk descriptors - * @param pages List of page information + * For flat schemas, these values are computed during header decoding (see gpuDecodePageHeaders). 
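A hedged usage sketch of the chunked reading loop documented above, through the public `cudf::io::chunked_parquet_reader` wrapper. The constructor signature should be checked against the headers of your cudf version, `example.parquet` is a placeholder path, and the 500 MB limit is an arbitrary choice:

```cpp
#include <cudf/io/parquet.hpp>

#include <cstddef>

int main()
{
  auto const options =
    cudf::io::parquet_reader_options::builder(cudf::io::source_info{"example.parquet"}).build();

  // Ask for at most ~500 MB of output per chunk; 0 would mean "no limit".
  cudf::io::chunked_parquet_reader reader(500'000'000, options);

  std::size_t total_rows = 0;
  do {
    auto chunk = reader.read_chunk();     // table_with_metadata
    total_rows += chunk.tbl->num_rows();  // process the chunk here
  } while (reader.has_next());

  return (total_rows > 0) ? 0 : 1;
}
```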
    *
-   * @return Device buffer to decompressed page data
+   * @param skip_rows Crop all rows below skip_rows
+   * @param num_rows Maximum number of rows to read
+   * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represent user-specific
+   * bounds
+   * @param chunk_read_limit Limit on total number of bytes to be returned per read,
+   * or `0` if there is no limit
    */
-  rmm::device_buffer decompress_page_data(hostdevice_vector<gpu::ColumnChunkDesc>& chunks,
-                                          hostdevice_vector<gpu::PageInfo>& pages);
+  void preprocess_pages(size_t skip_rows,
+                        size_t num_rows,
+                        bool uses_custom_row_bounds,
+                        size_t chunk_read_limit);
 
   /**
-   * @brief Allocate nesting information storage for all pages and set pointers
-   * to it.
+   * @brief Allocate nesting information storage for all pages and set pointers to it.
    *
    * One large contiguous buffer of PageNestingInfo structs is allocated and
    * distributed among the PageInfo structs.
    *
    * Note that this gets called even in the flat schema case so that we have a
    * consistent place to store common information such as value counts, etc.
-   *
-   * @param chunks List of column chunk descriptors
-   * @param pages List of page information
-   * @param page_nesting_info The allocated nesting info structs.
    */
-  void allocate_nesting_info(hostdevice_vector<gpu::ColumnChunkDesc> const& chunks,
-                             hostdevice_vector<gpu::PageInfo>& pages,
-                             hostdevice_vector<gpu::PageNestingInfo>& page_nesting_info);
+  void allocate_nesting_info();
 
   /**
-   * @brief Preprocess column information for nested schemas.
+   * @brief Read a chunk of data and return an output table.
    *
-   * There are several pieces of information we can't compute directly from row counts in
-   * the parquet headers when dealing with nested schemas.
-   * - The total sizes of all output columns at all nesting levels
-   * - The starting output buffer offset for each page, for each nesting level
+   * This function is called internally and expects that all preprocessing steps have already
+   * been done.
    *
-   * For flat schemas, these values are computed during header decoding (see gpuDecodePageHeaders)
+   * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represent user-specific
+   * bounds
+   * @return The output table along with columns' metadata
+   */
+  table_with_metadata read_chunk_internal(bool uses_custom_row_bounds);
+
+  /**
+   * @brief Finalize the output table by adding empty columns for the non-selected columns in
+   * schema.
+   *
+   * @param out_metadata The output table metadata
+   * @param out_columns The columns for building the output table
+   * @return The output table along with columns' metadata
+   */
+  table_with_metadata finalize_output(table_metadata& out_metadata,
+                                      std::vector<std::unique_ptr<column>>& out_columns);
+
+  /**
+   * @brief Allocate data buffers for the output columns.
    *
-   * @param chunks All chunks to be decoded
-   * @param pages All pages to be decoded
-   * @param min_rows crop all rows below min_row
-   * @param total_rows Maximum number of rows to read
-   * @param uses_custom_row_bounds Whether or not num_rows and min_rows represents user-specific
-   * bounds
-   * @param has_lists Whether or not this data contains lists and requires
-   * a preprocess.
+ * @param skip_rows Crop all rows below skip_rows + * @param num_rows Maximum number of rows to read + * @param uses_custom_row_bounds Whether or not num_rows and skip_rows represents user-specific + * bounds */ - void preprocess_columns(hostdevice_vector& chunks, - hostdevice_vector& pages, - size_t min_row, - size_t total_rows, - bool uses_custom_row_bounds, - bool has_lists); + void allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds); /** * @brief Converts the page data and outputs to columns. * - * @param chunks List of column chunk descriptors - * @param pages List of page information - * @param page_nesting Page nesting array - * @param min_row Minimum number of rows from start - * @param total_rows Number of rows to output + * @param skip_rows Minimum number of rows from start + * @param num_rows Number of rows to output */ - void decode_page_data(hostdevice_vector& chunks, - hostdevice_vector& pages, - hostdevice_vector& page_nesting, - size_t min_row, - size_t total_rows); + void decode_page_data(size_t skip_rows, size_t num_rows); private: rmm::cuda_stream_view _stream; @@ -197,17 +219,30 @@ class reader::impl { // input columns to be processed std::vector _input_columns; - // output columns to be generated - std::vector _output_columns; - // _output_columns associated schema indices + + // Buffers for generating output columns + std::vector _output_buffers; + + // Buffers copied from `_output_buffers` after construction for reuse + std::vector _output_buffers_template; + + // _output_buffers associated schema indices std::vector _output_column_schemas; + // _output_buffers associated metadata + std::unique_ptr _output_metadata; + bool _strings_to_categorical = false; std::optional> _reader_column_schema; data_type _timestamp_type{type_id::EMPTY}; + + // Variables used for chunked reading: + cudf::io::parquet::gpu::file_intermediate_data _file_itm_data; + cudf::io::parquet::gpu::chunk_intermediate_data _chunk_itm_data; + std::vector _chunk_read_info; + std::size_t _chunk_read_limit{0}; + std::size_t _current_read_chunk{0}; + bool _file_preprocessed{false}; }; -} // namespace parquet -} // namespace detail -} // namespace io -} // namespace cudf +} // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/parquet/reader_impl_helpers.cpp b/cpp/src/io/parquet/reader_impl_helpers.cpp new file mode 100644 index 00000000000..7090df2cae0 --- /dev/null +++ b/cpp/src/io/parquet/reader_impl_helpers.cpp @@ -0,0 +1,629 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+
+#include "reader_impl_helpers.hpp"
+
+#include <numeric>
+#include <regex>
+
+namespace cudf::io::detail::parquet {
+
+namespace {
+
+ConvertedType logical_type_to_converted_type(LogicalType const& logical)
+{
+  if (logical.isset.STRING) {
+    return parquet::UTF8;
+  } else if (logical.isset.MAP) {
+    return parquet::MAP;
+  } else if (logical.isset.LIST) {
+    return parquet::LIST;
+  } else if (logical.isset.ENUM) {
+    return parquet::ENUM;
+  } else if (logical.isset.DECIMAL) {
+    return parquet::DECIMAL;  // TODO set decimal values
+  } else if (logical.isset.DATE) {
+    return parquet::DATE;
+  } else if (logical.isset.TIME) {
+    if (logical.TIME.unit.isset.MILLIS)
+      return parquet::TIME_MILLIS;
+    else if (logical.TIME.unit.isset.MICROS)
+      return parquet::TIME_MICROS;
+  } else if (logical.isset.TIMESTAMP) {
+    if (logical.TIMESTAMP.unit.isset.MILLIS)
+      return parquet::TIMESTAMP_MILLIS;
+    else if (logical.TIMESTAMP.unit.isset.MICROS)
+      return parquet::TIMESTAMP_MICROS;
+  } else if (logical.isset.INTEGER) {
+    switch (logical.INTEGER.bitWidth) {
+      case 8: return logical.INTEGER.isSigned ? INT_8 : UINT_8;
+      case 16: return logical.INTEGER.isSigned ? INT_16 : UINT_16;
+      case 32: return logical.INTEGER.isSigned ? INT_32 : UINT_32;
+      case 64: return logical.INTEGER.isSigned ? INT_64 : UINT_64;
+      default: break;
+    }
+  } else if (logical.isset.UNKNOWN) {
+    return parquet::NA;
+  } else if (logical.isset.JSON) {
+    return parquet::JSON;
+  } else if (logical.isset.BSON) {
+    return parquet::BSON;
+  }
+  return parquet::UNKNOWN;
+}
+
+}  // namespace
+
+/**
+ * @brief Function that translates Parquet datatype to cuDF type enum
+ */
+type_id to_type_id(SchemaElement const& schema,
+                   bool strings_to_categorical,
+                   type_id timestamp_type_id)
+{
+  parquet::Type const physical = schema.type;
+  parquet::LogicalType const logical_type = schema.logical_type;
+  parquet::ConvertedType converted_type = schema.converted_type;
+  int32_t decimal_scale = schema.decimal_scale;
+
+  // Logical type used for actual data interpretation; the legacy converted type
+  // is superseded by 'logical' type whenever available.
+  auto const inferred_converted_type = logical_type_to_converted_type(logical_type);
+  if (inferred_converted_type != parquet::UNKNOWN) converted_type = inferred_converted_type;
+  if (inferred_converted_type == parquet::DECIMAL && decimal_scale == 0)
+    decimal_scale = schema.logical_type.DECIMAL.scale;
+
+  switch (converted_type) {
+    case parquet::UINT_8: return type_id::UINT8;
+    case parquet::INT_8: return type_id::INT8;
+    case parquet::UINT_16: return type_id::UINT16;
+    case parquet::INT_16: return type_id::INT16;
+    case parquet::UINT_32: return type_id::UINT32;
+    case parquet::UINT_64: return type_id::UINT64;
+    case parquet::DATE: return type_id::TIMESTAMP_DAYS;
+    case parquet::TIME_MILLIS: return type_id::DURATION_MILLISECONDS;
+    case parquet::TIME_MICROS: return type_id::DURATION_MICROSECONDS;
+    case parquet::TIMESTAMP_MILLIS:
+      return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id
+                                                   : type_id::TIMESTAMP_MILLISECONDS;
+    case parquet::TIMESTAMP_MICROS:
+      return (timestamp_type_id != type_id::EMPTY) ?
timestamp_type_id + : type_id::TIMESTAMP_MICROSECONDS; + case parquet::DECIMAL: + if (physical == parquet::INT32) { return type_id::DECIMAL32; } + if (physical == parquet::INT64) { return type_id::DECIMAL64; } + if (physical == parquet::FIXED_LEN_BYTE_ARRAY) { + if (schema.type_length <= static_cast(sizeof(int32_t))) { + return type_id::DECIMAL32; + } + if (schema.type_length <= static_cast(sizeof(int64_t))) { + return type_id::DECIMAL64; + } + if (schema.type_length <= static_cast(sizeof(__int128_t))) { + return type_id::DECIMAL128; + } + } + CUDF_FAIL("Invalid representation of decimal type"); + break; + + // maps are just List>. + case parquet::MAP: + case parquet::LIST: return type_id::LIST; + case parquet::NA: return type_id::STRING; + // return type_id::EMPTY; //TODO(kn): enable after Null/Empty column support + default: break; + } + + if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + logical_type.TIMESTAMP.unit.isset.NANOS) { + return (timestamp_type_id != type_id::EMPTY) ? timestamp_type_id + : type_id::TIMESTAMP_NANOSECONDS; + } + + if (inferred_converted_type == parquet::UNKNOWN and physical == parquet::INT64 and + logical_type.TIME.unit.isset.NANOS) { + return type_id::DURATION_NANOSECONDS; + } + + // is it simply a struct? + if (schema.is_struct()) { return type_id::STRUCT; } + + // Physical storage type supported by Parquet; controls the on-disk storage + // format in combination with the encoding type. + switch (physical) { + case parquet::BOOLEAN: return type_id::BOOL8; + case parquet::INT32: return type_id::INT32; + case parquet::INT64: return type_id::INT64; + case parquet::FLOAT: return type_id::FLOAT32; + case parquet::DOUBLE: return type_id::FLOAT64; + case parquet::BYTE_ARRAY: + case parquet::FIXED_LEN_BYTE_ARRAY: + // Can be mapped to INT32 (32-bit hash) or STRING + return strings_to_categorical ? type_id::INT32 : type_id::STRING; + case parquet::INT96: + return (timestamp_type_id != type_id::EMPTY) ? 
timestamp_type_id + : type_id::TIMESTAMP_NANOSECONDS; + default: break; + } + + return type_id::EMPTY; +} + +metadata::metadata(datasource* source) +{ + constexpr auto header_len = sizeof(file_header_s); + constexpr auto ender_len = sizeof(file_ender_s); + + const auto len = source->size(); + const auto header_buffer = source->host_read(0, header_len); + const auto header = reinterpret_cast(header_buffer->data()); + const auto ender_buffer = source->host_read(len - ender_len, ender_len); + const auto ender = reinterpret_cast(ender_buffer->data()); + CUDF_EXPECTS(len > header_len + ender_len, "Incorrect data source"); + CUDF_EXPECTS(header->magic == parquet_magic && ender->magic == parquet_magic, + "Corrupted header or footer"); + CUDF_EXPECTS(ender->footer_len != 0 && ender->footer_len <= (len - header_len - ender_len), + "Incorrect footer length"); + + const auto buffer = source->host_read(len - ender->footer_len - ender_len, ender->footer_len); + CompactProtocolReader cp(buffer->data(), ender->footer_len); + CUDF_EXPECTS(cp.read(this), "Cannot parse metadata"); + CUDF_EXPECTS(cp.InitSchema(this), "Cannot initialize schema"); +} + +std::vector aggregate_reader_metadata::metadatas_from_sources( + std::vector> const& sources) +{ + std::vector metadatas; + std::transform( + sources.cbegin(), sources.cend(), std::back_inserter(metadatas), [](auto const& source) { + return metadata(source.get()); + }); + return metadatas; +} + +std::vector> +aggregate_reader_metadata::collect_keyval_metadata() const +{ + std::vector> kv_maps; + std::transform(per_file_metadata.cbegin(), + per_file_metadata.cend(), + std::back_inserter(kv_maps), + [](auto const& pfm) { + std::unordered_map kv_map; + std::transform(pfm.key_value_metadata.cbegin(), + pfm.key_value_metadata.cend(), + std::inserter(kv_map, kv_map.end()), + [](auto const& kv) { + return std::pair{kv.key, kv.value}; + }); + return kv_map; + }); + + return kv_maps; +} + +size_type aggregate_reader_metadata::calc_num_rows() const +{ + return std::accumulate( + per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { + return sum + pfm.num_rows; + }); +} + +size_type aggregate_reader_metadata::calc_num_row_groups() const +{ + return std::accumulate( + per_file_metadata.begin(), per_file_metadata.end(), 0, [](auto& sum, auto& pfm) { + return sum + pfm.row_groups.size(); + }); +} + +aggregate_reader_metadata::aggregate_reader_metadata( + std::vector> const& sources) + : per_file_metadata(metadatas_from_sources(sources)), + keyval_maps(collect_keyval_metadata()), + num_rows(calc_num_rows()), + num_row_groups(calc_num_row_groups()) +{ + if (per_file_metadata.size() > 0) { + auto const& first_meta = per_file_metadata.front(); + auto const num_cols = + first_meta.row_groups.size() > 0 ? first_meta.row_groups.front().columns.size() : 0; + auto const& schema = first_meta.schema; + + // Verify that the input files have matching numbers of columns and schema. 
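The all-sources-must-match rule enforced in this constructor reduces to an equality check against the first file. A minimal sketch, with a hypothetical `FileMeta` standing in for the per-file metadata:

```cpp
#include <string>
#include <vector>

// Every file's schema must equal the first file's schema before multiple
// sources can be fused into one aggregate view.
struct FileMeta {
  std::vector<std::string> schema;
};

bool schemas_match(std::vector<FileMeta> const& files)
{
  if (files.empty()) { return true; }
  for (auto const& f : files) {
    if (f.schema != files.front().schema) { return false; }
  }
  return true;
}

int main()
{
  std::vector<FileMeta> ok{{{"a", "b"}}, {{"a", "b"}}};
  std::vector<FileMeta> bad{{{"a", "b"}}, {{"a"}}};
  return (schemas_match(ok) && !schemas_match(bad)) ? 0 : 1;
}
```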
+ for (auto const& pfm : per_file_metadata) { + if (pfm.row_groups.size() > 0) { + CUDF_EXPECTS(num_cols == pfm.row_groups.front().columns.size(), + "All sources must have the same number of columns"); + } + CUDF_EXPECTS(schema == pfm.schema, "All sources must have the same schema"); + } + } +} + +RowGroup const& aggregate_reader_metadata::get_row_group(size_type row_group_index, + size_type src_idx) const +{ + CUDF_EXPECTS(src_idx >= 0 && src_idx < static_cast(per_file_metadata.size()), + "invalid source index"); + return per_file_metadata[src_idx].row_groups[row_group_index]; +} + +ColumnChunkMetaData const& aggregate_reader_metadata::get_column_metadata(size_type row_group_index, + size_type src_idx, + int schema_idx) const +{ + auto col = std::find_if( + per_file_metadata[src_idx].row_groups[row_group_index].columns.begin(), + per_file_metadata[src_idx].row_groups[row_group_index].columns.end(), + [schema_idx](ColumnChunk const& col) { return col.schema_idx == schema_idx ? true : false; }); + CUDF_EXPECTS(col != std::end(per_file_metadata[src_idx].row_groups[row_group_index].columns), + "Found no metadata for schema index"); + return col->meta_data; +} + +std::string aggregate_reader_metadata::get_pandas_index() const +{ + // Assumes that all input files have the same metadata + // TODO: verify this assumption + auto it = keyval_maps[0].find("pandas"); + if (it != keyval_maps[0].end()) { + // Captures a list of quoted strings found inside square brackets after `"index_columns":` + // Inside quotes supports newlines, brackets, escaped quotes, etc. + // One-liner regex: + // "index_columns"\s*:\s*\[\s*((?:"(?:|(?:.*?(?![^\\]")).?)[^\\]?",?\s*)*)\] + // Documented below. + std::regex index_columns_expr{ + R"("index_columns"\s*:\s*\[\s*)" // match preamble, opening square bracket, whitespace + R"(()" // Open first capturing group + R"((?:")" // Open non-capturing group match opening quote + R"((?:|(?:.*?(?![^\\]")).?))" // match empty string or anything between quotes + R"([^\\]?")" // Match closing non-escaped quote + R"(,?\s*)" // Match optional comma and whitespace + R"()*)" // Close non-capturing group and repeat 0 or more times + R"())" // Close first capturing group + R"(\])" // Match closing square brackets + }; + std::smatch sm; + if (std::regex_search(it->second, sm, index_columns_expr)) { return sm[1].str(); } + } + return ""; +} + +std::vector aggregate_reader_metadata::get_pandas_index_names() const +{ + std::vector names; + auto str = get_pandas_index(); + if (str.length() != 0) { + std::regex index_name_expr{R"(\"((?:\\.|[^\"])*)\")"}; + std::smatch sm; + while (std::regex_search(str, sm, index_name_expr)) { + if (sm.size() == 2) { // 2 = whole match, first item + if (std::find(names.begin(), names.end(), sm[1].str()) == names.end()) { + std::regex esc_quote{R"(\\")"}; + names.emplace_back(std::regex_replace(sm[1].str(), esc_quote, R"(")")); + } + } + str = sm.suffix(); + } + } + return names; +} + +std::tuple> +aggregate_reader_metadata::select_row_groups( + host_span const> row_group_indices, + size_type row_start, + size_type row_count) const +{ + std::vector selection; + + if (!row_group_indices.empty()) { + CUDF_EXPECTS(row_group_indices.size() == per_file_metadata.size(), + "Must specify row groups for each source"); + + row_count = 0; + for (size_t src_idx = 0; src_idx < row_group_indices.size(); ++src_idx) { + for (auto const& rowgroup_idx : row_group_indices[src_idx]) { + CUDF_EXPECTS( + rowgroup_idx >= 0 && + rowgroup_idx < 
static_cast(per_file_metadata[src_idx].row_groups.size()), + "Invalid rowgroup index"); + selection.emplace_back(rowgroup_idx, row_count, src_idx); + row_count += get_row_group(rowgroup_idx, src_idx).num_rows; + } + } + + return {row_start, row_count, std::move(selection)}; + } + + row_start = std::max(row_start, 0); + if (row_count < 0) { + row_count = std::min(get_num_rows(), std::numeric_limits::max()); + } + row_count = std::min(row_count, get_num_rows() - row_start); + CUDF_EXPECTS(row_count >= 0, "Invalid row count"); + CUDF_EXPECTS(row_start <= get_num_rows(), "Invalid row start"); + + size_type count = 0; + for (size_t src_idx = 0; src_idx < per_file_metadata.size(); ++src_idx) { + for (size_t rg_idx = 0; rg_idx < per_file_metadata[src_idx].row_groups.size(); ++rg_idx) { + auto const chunk_start_row = count; + count += get_row_group(rg_idx, src_idx).num_rows; + if (count > row_start || count == 0) { + selection.emplace_back(rg_idx, chunk_start_row, src_idx); + } + if (count >= row_start + row_count) { break; } + } + } + + return {row_start, row_count, std::move(selection)}; +} + +std::tuple, std::vector, std::vector> +aggregate_reader_metadata::select_columns(std::optional> const& use_names, + bool include_index, + bool strings_to_categorical, + type_id timestamp_type_id) const +{ + auto find_schema_child = [&](SchemaElement const& schema_elem, std::string const& name) { + auto const& col_schema_idx = + std::find_if(schema_elem.children_idx.cbegin(), + schema_elem.children_idx.cend(), + [&](size_t col_schema_idx) { return get_schema(col_schema_idx).name == name; }); + + return (col_schema_idx != schema_elem.children_idx.end()) + ? static_cast(*col_schema_idx) + : -1; + }; + + std::vector output_columns; + std::vector input_columns; + std::vector nesting; + + // Return true if column path is valid. e.g. if the path is {"struct1", "child1"}, then it is + // valid if "struct1.child1" exists in this file's schema. If "struct1" exists but "child1" is + // not a child of "struct1" then the function will return false for "struct1" + std::function&, bool)> + build_column = [&](column_name_info const* col_name_info, + int schema_idx, + std::vector& out_col_array, + bool has_list_parent) { + if (schema_idx < 0) { return false; } + auto const& schema_elem = get_schema(schema_idx); + + // if schema_elem is a stub then it does not exist in the column_name_info and column_buffer + // hierarchy. So continue on + if (schema_elem.is_stub()) { + // is this legit? + CUDF_EXPECTS(schema_elem.num_children == 1, "Unexpected number of children for stub"); + auto child_col_name_info = (col_name_info) ? &col_name_info->children[0] : nullptr; + return build_column( + child_col_name_info, schema_elem.children_idx[0], out_col_array, has_list_parent); + } + + // if we're at the root, this is a new output column + auto const col_type = schema_elem.is_one_level_list() + ? type_id::LIST + : to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); + auto const dtype = to_data_type(col_type, schema_elem); + + column_buffer output_col(dtype, schema_elem.repetition_type == OPTIONAL); + if (has_list_parent) { output_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; } + // store the index of this element if inserted in out_col_array + nesting.push_back(static_cast(out_col_array.size())); + output_col.name = schema_elem.name; + + // build each child + bool path_is_valid = false; + if (col_name_info == nullptr or col_name_info->children.empty()) { + // add all children of schema_elem. 
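+      // (a bare struct or list selection like "name" pulls in every child field below it)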
+ // At this point, we can no longer pass a col_name_info to build_column + for (int idx = 0; idx < schema_elem.num_children; idx++) { + path_is_valid |= build_column(nullptr, + schema_elem.children_idx[idx], + output_col.children, + has_list_parent || col_type == type_id::LIST); + } + } else { + for (size_t idx = 0; idx < col_name_info->children.size(); idx++) { + path_is_valid |= + build_column(&col_name_info->children[idx], + find_schema_child(schema_elem, col_name_info->children[idx].name), + output_col.children, + has_list_parent || col_type == type_id::LIST); + } + } + + // if I have no children, we're at a leaf and I'm an input column (that is, one with actual + // data stored) so add me to the list. + if (schema_elem.num_children == 0) { + input_column_info& input_col = input_columns.emplace_back( + input_column_info{schema_idx, schema_elem.name, schema_elem.max_repetition_level > 0}); + + // set up child output column for one-level encoding list + if (schema_elem.is_one_level_list()) { + // determine the element data type + auto const element_type = + to_type_id(schema_elem, strings_to_categorical, timestamp_type_id); + auto const element_dtype = to_data_type(element_type, schema_elem); + + column_buffer element_col(element_dtype, schema_elem.repetition_type == OPTIONAL); + if (has_list_parent || col_type == type_id::LIST) { + element_col.user_data |= PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT; + } + // store the index of this element + nesting.push_back(static_cast(output_col.children.size())); + // TODO: not sure if we should assign a name or leave it blank + element_col.name = "element"; + + output_col.children.push_back(std::move(element_col)); + } + + std::copy(nesting.cbegin(), nesting.cend(), std::back_inserter(input_col.nesting)); + + // pop off the extra nesting element. + if (schema_elem.is_one_level_list()) { nesting.pop_back(); } + + path_is_valid = true; // If we're able to reach leaf then path is valid + } + + if (path_is_valid) { out_col_array.push_back(std::move(output_col)); } + + nesting.pop_back(); + return path_is_valid; + }; + + std::vector output_column_schemas; + + // + // there is not necessarily a 1:1 mapping between input columns and output columns. + // For example, parquet does not explicitly store a ColumnChunkDesc for struct columns. + // The "structiness" is simply implied by the schema. For example, this schema: + // required group field_id=1 name { + // required binary field_id=2 firstname (String); + // required binary field_id=3 middlename (String); + // required binary field_id=4 lastname (String); + // } + // will only contain 3 internal columns of data (firstname, middlename, lastname). But of + // course "name" is ultimately the struct column we want to return. + // + // "firstname", "middlename" and "lastname" represent the input columns in the file that we + // process to produce the final cudf "name" column. + // + // A user can ask for a single field out of the struct e.g. firstname. 
+ // In this case they'll pass a fully qualified name to the schema element like + // ["name", "firstname"] + // + auto const& root = get_schema(0); + if (not use_names.has_value()) { + for (auto const& schema_idx : root.children_idx) { + build_column(nullptr, schema_idx, output_columns, false); + output_column_schemas.push_back(schema_idx); + } + } else { + struct path_info { + std::string full_path; + int schema_idx; + }; + + // Convert schema into a vector of every possible path + std::vector all_paths; + std::function add_path = [&](std::string path_till_now, + int schema_idx) { + auto const& schema_elem = get_schema(schema_idx); + std::string curr_path = path_till_now + schema_elem.name; + all_paths.push_back({curr_path, schema_idx}); + for (auto const& child_idx : schema_elem.children_idx) { + add_path(curr_path + ".", child_idx); + } + }; + for (auto const& child_idx : get_schema(0).children_idx) { + add_path("", child_idx); + } + + // Find which of the selected paths are valid and get their schema index + std::vector valid_selected_paths; + for (auto const& selected_path : *use_names) { + auto found_path = + std::find_if(all_paths.begin(), all_paths.end(), [&](path_info& valid_path) { + return valid_path.full_path == selected_path; + }); + if (found_path != all_paths.end()) { + valid_selected_paths.push_back({selected_path, found_path->schema_idx}); + } + } + + // Now construct paths as vector of strings for further consumption + std::vector> use_names3; + std::transform(valid_selected_paths.begin(), + valid_selected_paths.end(), + std::back_inserter(use_names3), + [&](path_info const& valid_path) { + auto schema_idx = valid_path.schema_idx; + std::vector result_path; + do { + SchemaElement const& elem = get_schema(schema_idx); + result_path.push_back(elem.name); + schema_idx = elem.parent_idx; + } while (schema_idx > 0); + return std::vector(result_path.rbegin(), result_path.rend()); + }); + + std::vector selected_columns; + if (include_index) { + std::vector index_names = get_pandas_index_names(); + std::transform(index_names.cbegin(), + index_names.cend(), + std::back_inserter(selected_columns), + [](std::string const& name) { return column_name_info(name); }); + } + // Merge the vector use_names into a set of hierarchical column_name_info objects + /* This is because if we have columns like this: + * col1 + * / \ + * s3 f4 + * / \ + * f5 f6 + * + * there may be common paths in use_names like: + * {"col1", "s3", "f5"}, {"col1", "f4"} + * which means we want the output to contain + * col1 + * / \ + * s3 f4 + * / + * f5 + * + * rather than + * col1 col1 + * | | + * s3 f4 + * | + * f5 + */ + for (auto const& path : use_names3) { + auto array_to_find_in = &selected_columns; + for (size_t depth = 0; depth < path.size(); ++depth) { + // Check if the path exists in our selected_columns and if not, add it. + auto const& name_to_find = path[depth]; + auto found_col = std::find_if( + array_to_find_in->begin(), + array_to_find_in->end(), + [&name_to_find](column_name_info const& col) { return col.name == name_to_find; }); + if (found_col == array_to_find_in->end()) { + auto& col = array_to_find_in->emplace_back(name_to_find); + array_to_find_in = &col.children; + } else { + // Path exists. go down further. 
+          array_to_find_in = &found_col->children;
+        }
+      }
+    }
+    for (auto& col : selected_columns) {
+      auto const& top_level_col_schema_idx = find_schema_child(root, col.name);
+      bool valid_column = build_column(&col, top_level_col_schema_idx, output_columns, false);
+      if (valid_column) output_column_schemas.push_back(top_level_col_schema_idx);
+    }
+  }
+
+  return std::make_tuple(
+    std::move(input_columns), std::move(output_columns), std::move(output_column_schemas));
+}
+
+}  // namespace cudf::io::detail::parquet
diff --git a/cpp/src/io/parquet/reader_impl_helpers.hpp b/cpp/src/io/parquet/reader_impl_helpers.hpp
new file mode 100644
index 00000000000..6fa86a77e46
--- /dev/null
+++ b/cpp/src/io/parquet/reader_impl_helpers.hpp
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#pragma once
+
+#include "compact_protocol_reader.hpp"
+#include "parquet_gpu.hpp"
+
+#include <cudf/io/datasource.hpp>
+#include <cudf/types.hpp>
+#include <io/utilities/column_buffer.hpp>
+
+#include <optional>
+#include <vector>
+
+namespace cudf::io::detail::parquet {
+
+using namespace cudf::io::parquet;
+
+/**
+ * @brief Function that translates Parquet datatype to cuDF type enum
+ */
+[[nodiscard]] type_id to_type_id(SchemaElement const& schema,
+                                 bool strings_to_categorical,
+                                 type_id timestamp_type_id);
+
+/**
+ * @brief Converts cuDF type enum to column logical type
+ */
+[[nodiscard]] inline data_type to_data_type(type_id t_id, SchemaElement const& schema)
+{
+  return t_id == type_id::DECIMAL32 || t_id == type_id::DECIMAL64 || t_id == type_id::DECIMAL128
+           ? data_type{t_id, numeric::scale_type{-schema.decimal_scale}}
+           : data_type{t_id};
+}
+
+/**
+ * @brief Information needed to read a row group: its index, its starting row, and the
+ * index of the source file it comes from
+ */
+struct row_group_info {
+  size_type const index;
+  size_t const start_row;  // TODO source index
+  size_type const source_index;
+  row_group_info(size_type index, size_t start_row, size_type source_index)
+    : index(index), start_row(start_row), source_index(source_index)
+  {
+  }
+};
+
+/**
+ * @brief Class for parsing dataset metadata
+ */
+struct metadata : public FileMetaData {
+  explicit metadata(datasource* source);
+};
+
+class aggregate_reader_metadata {
+  std::vector<metadata> per_file_metadata;
+  std::vector<std::unordered_map<std::string, std::string>> keyval_maps;
+  size_type num_rows;
+  size_type num_row_groups;
+
+  /**
+   * @brief Create a metadata object from each element in the source vector
+   */
+  static std::vector<metadata> metadatas_from_sources(
+    std::vector<std::unique_ptr<datasource>> const& sources);
+
+  /**
+   * @brief Collect the keyvalue maps from each per-file metadata object into a vector of maps.
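+   *
+   * The returned vector is parallel to `per_file_metadata`: one map per input source.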
+   */
+  [[nodiscard]] std::vector<std::unordered_map<std::string, std::string>>
+  collect_keyval_metadata() const;
+
+  /**
+   * @brief Sums up the number of rows of each source
+   */
+  [[nodiscard]] size_type calc_num_rows() const;
+
+  /**
+   * @brief Sums up the number of row groups of each source
+   */
+  [[nodiscard]] size_type calc_num_row_groups() const;
+
+ public:
+  aggregate_reader_metadata(std::vector<std::unique_ptr<datasource>> const& sources);
+
+  [[nodiscard]] RowGroup const& get_row_group(size_type row_group_index, size_type src_idx) const;
+
+  [[nodiscard]] ColumnChunkMetaData const& get_column_metadata(size_type row_group_index,
+                                                               size_type src_idx,
+                                                               int schema_idx) const;
+
+  [[nodiscard]] auto get_num_rows() const { return num_rows; }
+
+  [[nodiscard]] auto get_num_row_groups() const { return num_row_groups; }
+
+  [[nodiscard]] auto const& get_schema(int schema_idx) const
+  {
+    return per_file_metadata[0].schema[schema_idx];
+  }
+
+  [[nodiscard]] auto const& get_key_value_metadata() const { return keyval_maps; }
+
+  /**
+   * @brief Gets the concrete nesting depth of output cudf columns
+   *
+   * @param schema_index Schema index of the input column
+   *
+   * @return The nesting depth of the output cudf column for the given schema index
+   */
+  [[nodiscard]] inline int get_output_nesting_depth(int schema_index) const
+  {
+    auto& pfm = per_file_metadata[0];
+    int depth = 0;
+
+    // walk upwards, skipping repeated fields
+    while (schema_index > 0) {
+      if (!pfm.schema[schema_index].is_stub()) { depth++; }
+      // schema of one-level encoding list doesn't contain nesting information, so we need to
+      // manually add an extra nesting level
+      if (pfm.schema[schema_index].is_one_level_list()) { depth++; }
+      schema_index = pfm.schema[schema_index].parent_idx;
+    }
+    return depth;
+  }
+
+  /**
+   * @brief Extracts the pandas "index_columns" section
+   *
+   * pandas adds its own metadata to the key_value section when writing out the
+   * dataframe to a file to aid in exact reconstruction. The JSON-formatted
+   * metadata contains the index column(s) and pandas-specific datatypes.
+   *
+   * @return comma-separated index column names in quotes
+   */
+  [[nodiscard]] std::string get_pandas_index() const;
+
+  /**
+   * @brief Extracts the column name(s) used for the row indexes in a dataframe
+   *
+   * @return Vector of index column names, with escaped quotes unescaped
+   */
+  [[nodiscard]] std::vector<std::string> get_pandas_index_names() const;
+
+  /**
+   * @brief Filters and reduces down to a selection of row groups
+   *
+   * The input `row_start` and `row_count` parameters will be recomputed and output as the valid
+   * values based on the input row group list.
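+   *
+   * @code
+   * // hypothetical usage: read row group 0 of source 0 and row group 1 of source 1
+   * std::vector<std::vector<size_type>> const indices{{0}, {1}};
+   * auto const [start, count, selection] = meta.select_row_groups(indices, 0, -1);
+   * @endcode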
+   *
+   * @param row_group_indices Lists of row groups to read, one per source
+   * @param row_start Starting row of the selection
+   * @param row_count Total number of rows selected
+   *
+   * @return A tuple of the corrected row_start and row_count, plus the list of selected row
+   * groups and their starting rows
+   */
+  [[nodiscard]] std::tuple<size_type, size_type, std::vector<row_group_info>> select_row_groups(
+    host_span<std::vector<size_type> const> row_group_indices,
+    size_type row_start,
+    size_type row_count) const;
+
+  /**
+   * @brief Filters and reduces down to a selection of columns
+   *
+   * @param use_names List of paths of column names to select; `nullopt` if the user did not
+   * select columns to read
+   * @param include_index Whether to always include the pandas index column(s)
+   * @param strings_to_categorical Type conversion parameter
+   * @param timestamp_type_id Type conversion parameter
+   *
+   * @return input column information, output column information, list of output column schema
+   * indices
+   */
+  [[nodiscard]] std::
+    tuple<std::vector<input_column_info>, std::vector<column_buffer>, std::vector<size_type>>
+    select_columns(std::optional<std::vector<std::string>> const& use_names,
+                   bool include_index,
+                   bool strings_to_categorical,
+                   type_id timestamp_type_id) const;
+};
+
+}  // namespace cudf::io::detail::parquet
diff --git a/cpp/src/io/parquet/reader_impl_preprocess.cu b/cpp/src/io/parquet/reader_impl_preprocess.cu
new file mode 100644
index 00000000000..38fce7d3263
--- /dev/null
+++ b/cpp/src/io/parquet/reader_impl_preprocess.cu
@@ -0,0 +1,1527 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "reader_impl.hpp"
+
+#include <io/comp/nvcomp_adapter.hpp>
+#include <io/utilities/config_utils.hpp>
+#include <io/utilities/time_utils.cuh>
+
+#include <cudf/detail/iterator.cuh>
+#include <cudf/detail/utilities/integer_utils.hpp>
+#include <cudf/detail/utilities/vector_factories.hpp>
+
+#include <rmm/exec_policy.hpp>
+
+#include <thrust/binary_search.h>
+#include <thrust/fill.h>
+#include <thrust/functional.h>
+#include <thrust/iterator/constant_iterator.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/iterator/discard_iterator.h>
+#include <thrust/iterator/transform_iterator.h>
+#include <thrust/logical.h>
+#include <thrust/reduce.h>
+#include <thrust/scan.h>
+#include <thrust/sequence.h>
+#include <thrust/sort.h>
+#include <thrust/transform.h>
+#include <thrust/unique.h>
+
+#include <numeric>
+
+namespace cudf::io::detail::parquet {
+namespace {
+
+/**
+ * @brief Generate depth remappings for repetition and definition levels.
+ *
+ * When dealing with columns that contain lists, we must examine incoming
+ * repetition and definition level pairs to determine what range of output nesting
+ * is indicated when adding new values.
This function generates the mappings of + * the R/D levels to those start/end bounds + * + * @param remap Maps column schema index to the R/D remapping vectors for that column + * @param src_col_schema The column schema to generate the new mapping for + * @param md File metadata information + */ +void generate_depth_remappings(std::map, std::vector>>& remap, + int src_col_schema, + aggregate_reader_metadata const& md) +{ + // already generated for this level + if (remap.find(src_col_schema) != remap.end()) { return; } + auto schema = md.get_schema(src_col_schema); + int max_depth = md.get_output_nesting_depth(src_col_schema); + + CUDF_EXPECTS(remap.find(src_col_schema) == remap.end(), + "Attempting to remap a schema more than once"); + auto inserted = + remap.insert(std::pair, std::vector>>{src_col_schema, {}}); + auto& depth_remap = inserted.first->second; + + std::vector& rep_depth_remap = (depth_remap.first); + rep_depth_remap.resize(schema.max_repetition_level + 1); + std::vector& def_depth_remap = (depth_remap.second); + def_depth_remap.resize(schema.max_definition_level + 1); + + // the key: + // for incoming level values R/D + // add values starting at the shallowest nesting level X has repetition level R + // until you reach the deepest nesting level Y that corresponds to the repetition level R1 + // held by the nesting level that has definition level D + // + // Example: a 3 level struct with a list at the bottom + // + // R / D Depth + // level0 0 / 1 0 + // level1 0 / 2 1 + // level2 0 / 3 2 + // list 0 / 3 3 + // element 1 / 4 4 + // + // incoming R/D : 0, 0 -> add values from depth 0 to 3 (def level 0 always maps to depth 0) + // incoming R/D : 0, 1 -> add values from depth 0 to 3 + // incoming R/D : 0, 2 -> add values from depth 0 to 3 + // incoming R/D : 1, 4 -> add values from depth 4 to 4 + // + // Note : the -validity- of values is simply checked by comparing the incoming D value against the + // D value of the given nesting level (incoming D >= the D for the nesting level == valid, + // otherwise NULL). The tricky part is determining what nesting levels to add values at. + // + // For schemas with no repetition level (no lists), X is always 0 and Y is always max nesting + // depth. + // + + // compute "X" from above + for (int s_idx = schema.max_repetition_level; s_idx >= 0; s_idx--) { + auto find_shallowest = [&](int r) { + int shallowest = -1; + int cur_depth = max_depth - 1; + int schema_idx = src_col_schema; + while (schema_idx > 0) { + auto cur_schema = md.get_schema(schema_idx); + if (cur_schema.max_repetition_level == r) { + // if this is a repeated field, map it one level deeper + shallowest = cur_schema.is_stub() ? cur_depth + 1 : cur_depth; + } + // if it's one-level encoding list + else if (cur_schema.is_one_level_list()) { + shallowest = cur_depth - 1; + } + if (!cur_schema.is_stub()) { cur_depth--; } + schema_idx = cur_schema.parent_idx; + } + return shallowest; + }; + rep_depth_remap[s_idx] = find_shallowest(s_idx); + } + + // compute "Y" from above + for (int s_idx = schema.max_definition_level; s_idx >= 0; s_idx--) { + auto find_deepest = [&](int d) { + SchemaElement prev_schema; + int schema_idx = src_col_schema; + int r1 = 0; + while (schema_idx > 0) { + SchemaElement cur_schema = md.get_schema(schema_idx); + if (cur_schema.max_definition_level == d) { + // if this is a repeated field, map it one level deeper + r1 = cur_schema.is_stub() ? 
prev_schema.max_repetition_level + : cur_schema.max_repetition_level; + break; + } + prev_schema = cur_schema; + schema_idx = cur_schema.parent_idx; + } + + // we now know R1 from above. return the deepest nesting level that has the + // same repetition level + schema_idx = src_col_schema; + int depth = max_depth - 1; + while (schema_idx > 0) { + SchemaElement cur_schema = md.get_schema(schema_idx); + if (cur_schema.max_repetition_level == r1) { + // if this is a repeated field, map it one level deeper + depth = cur_schema.is_stub() ? depth + 1 : depth; + break; + } + if (!cur_schema.is_stub()) { depth--; } + prev_schema = cur_schema; + schema_idx = cur_schema.parent_idx; + } + return depth; + }; + def_depth_remap[s_idx] = find_deepest(s_idx); + } +} + +/** + * @brief Return the required number of bits to store a value. + */ +template +[[nodiscard]] T required_bits(uint32_t max_level) +{ + return static_cast(CompactProtocolReader::NumRequiredBits(max_level)); +} + +/** + * @brief Converts cuDF units to Parquet units. + * + * @return A tuple of Parquet type width, Parquet clock rate and Parquet decimal type. + */ +[[nodiscard]] std::tuple conversion_info(type_id column_type_id, + type_id timestamp_type_id, + parquet::Type physical, + int8_t converted, + int32_t length) +{ + int32_t type_width = (physical == parquet::FIXED_LEN_BYTE_ARRAY) ? length : 0; + int32_t clock_rate = 0; + if (column_type_id == type_id::INT8 or column_type_id == type_id::UINT8) { + type_width = 1; // I32 -> I8 + } else if (column_type_id == type_id::INT16 or column_type_id == type_id::UINT16) { + type_width = 2; // I32 -> I16 + } else if (column_type_id == type_id::INT32) { + type_width = 4; // str -> hash32 + } else if (is_chrono(data_type{column_type_id})) { + clock_rate = to_clockrate(timestamp_type_id); + } + + int8_t converted_type = converted; + if (converted_type == parquet::DECIMAL && column_type_id != type_id::FLOAT64 && + not cudf::is_fixed_point(data_type{column_type_id})) { + converted_type = parquet::UNKNOWN; // Not converting to float64 or decimal + } + return std::make_tuple(type_width, clock_rate, converted_type); +} + +/** + * @brief Reads compressed page data to device memory. 
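+ *
+ * Reads of adjacent chunks are coalesced when they are contiguous in the file and share the
+ * same compression state; compressed and uncompressed chunks are never merged, so compressed
+ * buffers can be freed right after the decompression stage.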
+ * + * @param sources Dataset sources + * @param page_data Buffers to hold compressed page data for each chunk + * @param chunks List of column chunk descriptors + * @param begin_chunk Index of first column chunk to read + * @param end_chunk Index after the last column chunk to read + * @param column_chunk_offsets File offset for all chunks + * @param chunk_source_map Association between each column chunk and its source + * @param stream CUDA stream used for device memory operations and kernel launches + * + * @return A future object for reading synchronization + */ +[[nodiscard]] std::future read_column_chunks_async( + std::vector> const& sources, + std::vector>& page_data, + hostdevice_vector& chunks, + size_t begin_chunk, + size_t end_chunk, + const std::vector& column_chunk_offsets, + std::vector const& chunk_source_map, + rmm::cuda_stream_view stream) +{ + // Transfer chunk data, coalescing adjacent chunks + std::vector> read_tasks; + for (size_t chunk = begin_chunk; chunk < end_chunk;) { + const size_t io_offset = column_chunk_offsets[chunk]; + size_t io_size = chunks[chunk].compressed_size; + size_t next_chunk = chunk + 1; + const bool is_compressed = (chunks[chunk].codec != parquet::Compression::UNCOMPRESSED); + while (next_chunk < end_chunk) { + const size_t next_offset = column_chunk_offsets[next_chunk]; + const bool is_next_compressed = + (chunks[next_chunk].codec != parquet::Compression::UNCOMPRESSED); + if (next_offset != io_offset + io_size || is_next_compressed != is_compressed) { + // Can't merge if not contiguous or mixing compressed and uncompressed + // Not coalescing uncompressed with compressed chunks is so that compressed buffers can be + // freed earlier (immediately after decompression stage) to limit peak memory requirements + break; + } + io_size += chunks[next_chunk].compressed_size; + next_chunk++; + } + if (io_size != 0) { + auto& source = sources[chunk_source_map[chunk]]; + if (source->is_device_read_preferred(io_size)) { + auto buffer = rmm::device_buffer(io_size, stream); + auto fut_read_size = source->device_read_async( + io_offset, io_size, static_cast(buffer.data()), stream); + read_tasks.emplace_back(std::move(fut_read_size)); + page_data[chunk] = datasource::buffer::create(std::move(buffer)); + } else { + auto const buffer = source->host_read(io_offset, io_size); + page_data[chunk] = + datasource::buffer::create(rmm::device_buffer(buffer->data(), buffer->size(), stream)); + } + auto d_compdata = page_data[chunk]->data(); + do { + chunks[chunk].compressed_data = d_compdata; + d_compdata += chunks[chunk].compressed_size; + } while (++chunk != next_chunk); + } else { + chunk = next_chunk; + } + } + auto sync_fn = [](decltype(read_tasks) read_tasks) { + for (auto& task : read_tasks) { + task.wait(); + } + }; + return std::async(std::launch::deferred, sync_fn, std::move(read_tasks)); +} + +/** + * @brief Return the number of total pages from the given column chunks. 
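+ *
+ * Page headers are decoded on the device via gpu::DecodePageHeaders; the returned count
+ * includes both data pages and dictionary pages.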
+ *
+ * @param chunks List of column chunk descriptors
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ *
+ * @return The total number of pages
+ */
+[[nodiscard]] size_t count_page_headers(hostdevice_vector<gpu::ColumnChunkDesc>& chunks,
+                                        rmm::cuda_stream_view stream)
+{
+  size_t total_pages = 0;
+
+  chunks.host_to_device(stream);
+  gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream);
+  chunks.device_to_host(stream, true);
+
+  for (size_t c = 0; c < chunks.size(); c++) {
+    total_pages += chunks[c].num_data_pages + chunks[c].num_dict_pages;
+  }
+
+  return total_pages;
+}
+
+/**
+ * @brief Decode the page information from the given column chunks.
+ *
+ * @param chunks List of column chunk descriptors
+ * @param pages List of page information
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ */
+void decode_page_headers(hostdevice_vector<gpu::ColumnChunkDesc>& chunks,
+                         hostdevice_vector<gpu::PageInfo>& pages,
+                         rmm::cuda_stream_view stream)
+{
+  // IMPORTANT : if you change how pages are stored within a chunk (dict pages, then data pages),
+  // please update preprocess_nested_columns to reflect this.
+  for (size_t c = 0, page_count = 0; c < chunks.size(); c++) {
+    chunks[c].max_num_pages = chunks[c].num_data_pages + chunks[c].num_dict_pages;
+    chunks[c].page_info     = pages.device_ptr(page_count);
+    page_count += chunks[c].max_num_pages;
+  }
+
+  chunks.host_to_device(stream);
+  gpu::DecodePageHeaders(chunks.device_ptr(), chunks.size(), stream);
+  pages.device_to_host(stream, true);
+}
+
+/**
+ * @brief Decompresses the page data, at page granularity.
+ *
+ * @param chunks List of column chunk descriptors
+ * @param pages List of page information
+ * @param stream CUDA stream used for device memory operations and kernel launches
+ *
+ * @return Device buffer to decompressed page data
+ */
+[[nodiscard]] rmm::device_buffer decompress_page_data(
+  hostdevice_vector<gpu::ColumnChunkDesc>& chunks,
+  hostdevice_vector<gpu::PageInfo>& pages,
+  rmm::cuda_stream_view stream)
+{
+  auto for_each_codec_page = [&](parquet::Compression codec,
+                                 const std::function<void(size_t)>& f) {
+    for (size_t c = 0, page_count = 0; c < chunks.size(); c++) {
+      const auto page_stride = chunks[c].max_num_pages;
+      if (chunks[c].codec == codec) {
+        for (int k = 0; k < page_stride; k++) {
+          f(page_count + k);
+        }
+      }
+      page_count += page_stride;
+    }
+  };
+
+  // Brotli scratch memory for decompressing
+  rmm::device_buffer debrotli_scratch;
+
+  // Count the exact number of compressed pages
+  size_t num_comp_pages    = 0;
+  size_t total_decomp_size = 0;
+
+  struct codec_stats {
+    parquet::Compression compression_type = UNCOMPRESSED;
+    size_t num_pages                      = 0;
+    int32_t max_decompressed_size         = 0;
+    size_t total_decomp_size              = 0;
+  };
+
+  std::array codecs{codec_stats{parquet::GZIP},
+                    codec_stats{parquet::SNAPPY},
+                    codec_stats{parquet::BROTLI},
+                    codec_stats{parquet::ZSTD}};
+
+  auto is_codec_supported = [&codecs](int8_t codec) {
+    if (codec == parquet::UNCOMPRESSED) return true;
+    return std::find_if(codecs.begin(), codecs.end(), [codec](auto& cstats) {
+             return codec == cstats.compression_type;
+           }) != codecs.end();
+  };
+  CUDF_EXPECTS(std::all_of(chunks.begin(),
+                           chunks.end(),
+                           [&is_codec_supported](auto const& chunk) {
+                             return is_codec_supported(chunk.codec);
+                           }),
+               "Unsupported compression type");
+
+  for (auto& codec : codecs) {
+    for_each_codec_page(codec.compression_type, [&](size_t page) {
+      auto page_uncomp_size = pages[page].uncompressed_page_size;
+      total_decomp_size += page_uncomp_size;
+      codec.total_decomp_size += page_uncomp_size;
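+      // also track the largest single page for this codec; it is passed to nvcomp's
+      // batched decompression below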
codec.max_decompressed_size = std::max(codec.max_decompressed_size, page_uncomp_size); + codec.num_pages++; + num_comp_pages++; + }); + if (codec.compression_type == parquet::BROTLI && codec.num_pages > 0) { + debrotli_scratch.resize(get_gpu_debrotli_scratch_size(codec.num_pages), stream); + } + } + + // Dispatch batches of pages to decompress for each codec + rmm::device_buffer decomp_pages(total_decomp_size, stream); + + std::vector> comp_in; + comp_in.reserve(num_comp_pages); + std::vector> comp_out; + comp_out.reserve(num_comp_pages); + + // vectors to save v2 def and rep level data, if any + std::vector> copy_in; + copy_in.reserve(num_comp_pages); + std::vector> copy_out; + copy_out.reserve(num_comp_pages); + + rmm::device_uvector comp_res(num_comp_pages, stream); + thrust::fill(rmm::exec_policy(stream), + comp_res.begin(), + comp_res.end(), + compression_result{0, compression_status::FAILURE}); + + size_t decomp_offset = 0; + int32_t start_pos = 0; + for (const auto& codec : codecs) { + if (codec.num_pages == 0) { continue; } + + for_each_codec_page(codec.compression_type, [&](size_t page_idx) { + auto const dst_base = static_cast(decomp_pages.data()) + decomp_offset; + auto& page = pages[page_idx]; + // offset will only be non-zero for V2 pages + auto const offset = page.def_lvl_bytes + page.rep_lvl_bytes; + // for V2 need to copy def and rep level info into place, and then offset the + // input and output buffers. otherwise we'd have to keep both the compressed + // and decompressed data. + if (offset != 0) { + copy_in.emplace_back(page.page_data, offset); + copy_out.emplace_back(dst_base, offset); + } + comp_in.emplace_back(page.page_data + offset, + static_cast(page.compressed_page_size - offset)); + comp_out.emplace_back(dst_base + offset, + static_cast(page.uncompressed_page_size - offset)); + page.page_data = dst_base; + decomp_offset += page.uncompressed_page_size; + }); + + host_span const> comp_in_view{comp_in.data() + start_pos, + codec.num_pages}; + auto const d_comp_in = cudf::detail::make_device_uvector_async(comp_in_view, stream); + host_span const> comp_out_view(comp_out.data() + start_pos, + codec.num_pages); + auto const d_comp_out = cudf::detail::make_device_uvector_async(comp_out_view, stream); + device_span d_comp_res_view(comp_res.data() + start_pos, codec.num_pages); + + switch (codec.compression_type) { + case parquet::GZIP: + gpuinflate(d_comp_in, d_comp_out, d_comp_res_view, gzip_header_included::YES, stream); + break; + case parquet::SNAPPY: + if (nvcomp_integration::is_stable_enabled()) { + nvcomp::batched_decompress(nvcomp::compression_type::SNAPPY, + d_comp_in, + d_comp_out, + d_comp_res_view, + codec.max_decompressed_size, + codec.total_decomp_size, + stream); + } else { + gpu_unsnap(d_comp_in, d_comp_out, d_comp_res_view, stream); + } + break; + case parquet::ZSTD: + nvcomp::batched_decompress(nvcomp::compression_type::ZSTD, + d_comp_in, + d_comp_out, + d_comp_res_view, + codec.max_decompressed_size, + codec.total_decomp_size, + stream); + break; + case parquet::BROTLI: + gpu_debrotli(d_comp_in, + d_comp_out, + d_comp_res_view, + debrotli_scratch.data(), + debrotli_scratch.size(), + stream); + break; + default: CUDF_FAIL("Unexpected decompression dispatch"); break; + } + start_pos += codec.num_pages; + } + + CUDF_EXPECTS(thrust::all_of(rmm::exec_policy(stream), + comp_res.begin(), + comp_res.end(), + [] __device__(auto const& res) { + return res.status == compression_status::SUCCESS; + }), + "Error during decompression"); + + // now copy the 
uncompressed V2 def and rep level data
+  if (not copy_in.empty()) {
+    auto const d_copy_in  = cudf::detail::make_device_uvector_async(copy_in, stream);
+    auto const d_copy_out = cudf::detail::make_device_uvector_async(copy_out, stream);
+
+    gpu_copy_uncompressed_blocks(d_copy_in, d_copy_out, stream);
+    stream.synchronize();
+  }
+
+  // Update the page information in device memory with the updated value of
+  // page_data; it now points to the uncompressed data buffer
+  pages.host_to_device(stream);
+
+  return decomp_pages;
+}
+
+}  // namespace
+
+void reader::impl::allocate_nesting_info()
+{
+  auto const& chunks      = _file_itm_data.chunks;
+  auto& pages             = _file_itm_data.pages_info;
+  auto& page_nesting_info = _file_itm_data.page_nesting_info;
+
+  // compute total # of page_nesting infos needed and allocate space. doing this in one
+  // buffer to keep it to a single gpu allocation
+  size_t const total_page_nesting_infos = std::accumulate(
+    chunks.host_ptr(), chunks.host_ptr() + chunks.size(), 0, [&](int total, auto& chunk) {
+      // the schema of the input column
+      auto const& schema = _metadata->get_schema(chunk.src_col_schema);
+      auto const per_page_nesting_info_size = std::max(
+        schema.max_definition_level + 1, _metadata->get_output_nesting_depth(chunk.src_col_schema));
+      return total + (per_page_nesting_info_size * chunk.num_data_pages);
+    });
+
+  page_nesting_info = hostdevice_vector<gpu::PageNestingInfo>{total_page_nesting_infos, _stream};
+
+  // retrieve from the gpu so we can update
+  pages.device_to_host(_stream, true);
+
+  // update pointers in the PageInfos
+  int target_page_index = 0;
+  int src_info_index    = 0;
+  for (size_t idx = 0; idx < chunks.size(); idx++) {
+    int src_col_schema = chunks[idx].src_col_schema;
+    auto& schema       = _metadata->get_schema(src_col_schema);
+    auto const per_page_nesting_info_size = std::max(
+      schema.max_definition_level + 1, _metadata->get_output_nesting_depth(src_col_schema));
+
+    // skip my dict pages
+    target_page_index += chunks[idx].num_dict_pages;
+    for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) {
+      pages[target_page_index + p_idx].nesting = page_nesting_info.device_ptr() + src_info_index;
+      pages[target_page_index + p_idx].num_nesting_levels = per_page_nesting_info_size;
+
+      src_info_index += per_page_nesting_info_size;
+    }
+    target_page_index += chunks[idx].num_data_pages;
+  }
+
+  // copy back to the gpu
+  pages.host_to_device(_stream);
+
+  // fill in
+  int nesting_info_index = 0;
+  std::map<int, std::pair<std::vector<int>, std::vector<int>>> depth_remapping;
+  for (size_t idx = 0; idx < chunks.size(); idx++) {
+    int src_col_schema = chunks[idx].src_col_schema;
+
+    // schema of the input column
+    auto& schema = _metadata->get_schema(src_col_schema);
+    // real depth of the output cudf column hierarchy (1 == no nesting, 2 == 1 level, etc)
+    int max_depth = _metadata->get_output_nesting_depth(src_col_schema);
+
+    // # of nesting infos stored per page for this column
+    auto const per_page_nesting_info_size = std::max(schema.max_definition_level + 1, max_depth);
+
+    // if this column has lists, generate depth remapping
+    if (schema.max_repetition_level > 0) {
+      generate_depth_remappings(depth_remapping, src_col_schema, *_metadata);
+    }
+
+    // fill in host-side nesting info
+    int schema_idx  = src_col_schema;
+    auto cur_schema = _metadata->get_schema(schema_idx);
+    int cur_depth   = max_depth - 1;
+    while (schema_idx > 0) {
+      // stub columns (basically the inner field of a list schema element) are not real columns.
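+      // (e.g. the repeated middle group of the 3-level LIST encoding: it carries
+      // repetition/definition levels but produces no output column of its own)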
+ // we can ignore them for the purposes of output nesting info + if (!cur_schema.is_stub()) { + // initialize each page within the chunk + for (int p_idx = 0; p_idx < chunks[idx].num_data_pages; p_idx++) { + gpu::PageNestingInfo* pni = + &page_nesting_info[nesting_info_index + (p_idx * per_page_nesting_info_size)]; + + // if we have lists, set our start and end depth remappings + if (schema.max_repetition_level > 0) { + auto remap = depth_remapping.find(src_col_schema); + CUDF_EXPECTS(remap != depth_remapping.end(), + "Could not find depth remapping for schema"); + std::vector const& rep_depth_remap = (remap->second.first); + std::vector const& def_depth_remap = (remap->second.second); + + for (size_t m = 0; m < rep_depth_remap.size(); m++) { + pni[m].start_depth = rep_depth_remap[m]; + } + for (size_t m = 0; m < def_depth_remap.size(); m++) { + pni[m].end_depth = def_depth_remap[m]; + } + } + + // values indexed by output column index + pni[cur_depth].max_def_level = cur_schema.max_definition_level; + pni[cur_depth].max_rep_level = cur_schema.max_repetition_level; + pni[cur_depth].size = 0; + pni[cur_depth].type = + to_type_id(cur_schema, _strings_to_categorical, _timestamp_type.id()); + pni[cur_depth].nullable = cur_schema.repetition_type == OPTIONAL; + } + + // move up the hierarchy + cur_depth--; + } + + // next schema + schema_idx = cur_schema.parent_idx; + cur_schema = _metadata->get_schema(schema_idx); + } + + nesting_info_index += (per_page_nesting_info_size * chunks[idx].num_data_pages); + } + + // copy nesting info to the device + page_nesting_info.host_to_device(_stream); +} + +void reader::impl::load_and_decompress_data(std::vector const& row_groups_info, + size_type num_rows) +{ + // This function should never be called if `num_rows == 0`. 
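+  // (callers are expected to short-circuit empty selections before reaching this point)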
+ CUDF_EXPECTS(num_rows > 0, "Number of reading rows must not be zero."); + + auto& raw_page_data = _file_itm_data.raw_page_data; + auto& decomp_page_data = _file_itm_data.decomp_page_data; + auto& chunks = _file_itm_data.chunks; + auto& pages_info = _file_itm_data.pages_info; + + // Descriptors for all the chunks that make up the selected columns + const auto num_input_columns = _input_columns.size(); + const auto num_chunks = row_groups_info.size() * num_input_columns; + chunks = hostdevice_vector(0, num_chunks, _stream); + + // Association between each column chunk and its source + std::vector chunk_source_map(num_chunks); + + // Tracker for eventually deallocating compressed and uncompressed data + raw_page_data = std::vector>(num_chunks); + + // Keep track of column chunk file offsets + std::vector column_chunk_offsets(num_chunks); + + // Initialize column chunk information + size_t total_decompressed_size = 0; + auto remaining_rows = num_rows; + std::vector> read_rowgroup_tasks; + for (const auto& rg : row_groups_info) { + const auto& row_group = _metadata->get_row_group(rg.index, rg.source_index); + auto const row_group_start = rg.start_row; + auto const row_group_source = rg.source_index; + auto const row_group_rows = std::min(remaining_rows, row_group.num_rows); + auto const io_chunk_idx = chunks.size(); + + // generate ColumnChunkDesc objects for everything to be decoded (all input columns) + for (size_t i = 0; i < num_input_columns; ++i) { + auto col = _input_columns[i]; + // look up metadata + auto& col_meta = _metadata->get_column_metadata(rg.index, rg.source_index, col.schema_idx); + auto& schema = _metadata->get_schema(col.schema_idx); + + auto [type_width, clock_rate, converted_type] = + conversion_info(to_type_id(schema, _strings_to_categorical, _timestamp_type.id()), + _timestamp_type.id(), + schema.type, + schema.converted_type, + schema.type_length); + + column_chunk_offsets[chunks.size()] = + (col_meta.dictionary_page_offset != 0) + ? 
std::min(col_meta.data_page_offset, col_meta.dictionary_page_offset) + : col_meta.data_page_offset; + + chunks.push_back(gpu::ColumnChunkDesc(col_meta.total_compressed_size, + nullptr, + col_meta.num_values, + schema.type, + type_width, + row_group_start, + row_group_rows, + schema.max_definition_level, + schema.max_repetition_level, + _metadata->get_output_nesting_depth(col.schema_idx), + required_bits(schema.max_definition_level), + required_bits(schema.max_repetition_level), + col_meta.codec, + converted_type, + schema.logical_type, + schema.decimal_scale, + clock_rate, + i, + col.schema_idx)); + + // Map each column chunk to its column index and its source index + chunk_source_map[chunks.size() - 1] = row_group_source; + + if (col_meta.codec != Compression::UNCOMPRESSED) { + total_decompressed_size += col_meta.total_uncompressed_size; + } + } + // Read compressed chunk data to device memory + read_rowgroup_tasks.push_back(read_column_chunks_async(_sources, + raw_page_data, + chunks, + io_chunk_idx, + chunks.size(), + column_chunk_offsets, + chunk_source_map, + _stream)); + + remaining_rows -= row_group.num_rows; + } + for (auto& task : read_rowgroup_tasks) { + task.wait(); + } + + CUDF_EXPECTS(remaining_rows <= 0, "All rows data must be read."); + + // Process dataset chunk pages into output columns + auto const total_pages = count_page_headers(chunks, _stream); + pages_info = hostdevice_vector(total_pages, total_pages, _stream); + + if (total_pages > 0) { + // decoding of column/page information + decode_page_headers(chunks, pages_info, _stream); + if (total_decompressed_size > 0) { + decomp_page_data = decompress_page_data(chunks, pages_info, _stream); + // Free compressed data + for (size_t c = 0; c < chunks.size(); c++) { + if (chunks[c].codec != parquet::Compression::UNCOMPRESSED) { + raw_page_data[c].reset(); + // TODO: Check if this is called + } + } + } + + // build output column info + // walk the schema, building out_buffers that mirror what our final cudf columns will look + // like. important : there is not necessarily a 1:1 mapping between input columns and output + // columns. For example, parquet does not explicitly store a ColumnChunkDesc for struct + // columns. The "structiness" is simply implied by the schema. For example, this schema: + // required group field_id=1 name { + // required binary field_id=2 firstname (String); + // required binary field_id=3 middlename (String); + // required binary field_id=4 lastname (String); + // } + // will only contain 3 columns of data (firstname, middlename, lastname). But of course + // "name" is a struct column that we want to return, so we have to make sure that we + // create it ourselves. 
+ // std::vector output_info = build_output_column_info(); + + // nesting information (sizes, etc) stored -per page- + // note : even for flat schemas, we allocate 1 level of "nesting" info + allocate_nesting_info(); + } +} + +namespace { + +struct cumulative_row_info { + size_t row_count; // cumulative row count + size_t size_bytes; // cumulative size in bytes + int key; // schema index +}; + +#if defined(PREPROCESS_DEBUG) +void print_pages(hostdevice_vector& pages, rmm::cuda_stream_view _stream) +{ + pages.device_to_host(_stream, true); + for (size_t idx = 0; idx < pages.size(); idx++) { + auto const& p = pages[idx]; + // skip dictionary pages + if (p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) { continue; } + printf( + "P(%lu, s:%d): chunk_row(%d), num_rows(%d), skipped_values(%d), skipped_leaf_values(%d)\n", + idx, + p.src_col_schema, + p.chunk_row, + p.num_rows, + p.skipped_values, + p.skipped_leaf_values); + } +} + +void print_cumulative_page_info(hostdevice_vector& pages, + rmm::device_uvector const& page_index, + rmm::device_uvector const& c_info, + rmm::cuda_stream_view stream) +{ + pages.device_to_host(stream, true); + + printf("------------\nCumulative sizes by page\n"); + + std::vector schemas(pages.size()); + std::vector h_page_index(pages.size()); + cudaMemcpy( + h_page_index.data(), page_index.data(), sizeof(int) * pages.size(), cudaMemcpyDeviceToHost); + std::vector h_cinfo(pages.size()); + cudaMemcpy(h_cinfo.data(), + c_info.data(), + sizeof(cumulative_row_info) * pages.size(), + cudaMemcpyDeviceToHost); + auto schema_iter = cudf::detail::make_counting_transform_iterator( + 0, [&](size_type i) { return pages[h_page_index[i]].src_col_schema; }); + thrust::copy(thrust::seq, schema_iter, schema_iter + pages.size(), schemas.begin()); + auto last = thrust::unique(thrust::seq, schemas.begin(), schemas.end()); + schemas.resize(last - schemas.begin()); + printf("Num schemas: %lu\n", schemas.size()); + + for (size_t idx = 0; idx < schemas.size(); idx++) { + printf("Schema %d\n", schemas[idx]); + for (size_t pidx = 0; pidx < pages.size(); pidx++) { + auto const& page = pages[h_page_index[pidx]]; + if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY || page.src_col_schema != schemas[idx]) { + continue; + } + printf("\tP: {%lu, %lu}\n", h_cinfo[pidx].row_count, h_cinfo[pidx].size_bytes); + } + } +} + +void print_cumulative_row_info( + host_span sizes, + std::string const& label, + std::optional> splits = std::nullopt) +{ + if (splits.has_value()) { + printf("------------\nSplits\n"); + for (size_t idx = 0; idx < splits->size(); idx++) { + printf("{%lu, %lu}\n", splits.value()[idx].skip_rows, splits.value()[idx].num_rows); + } + } + + printf("------------\nCumulative sizes %s\n", label.c_str()); + for (size_t idx = 0; idx < sizes.size(); idx++) { + printf("{%lu, %lu, %d}", sizes[idx].row_count, sizes[idx].size_bytes, sizes[idx].key); + if (splits.has_value()) { + // if we have a split at this row count and this is the last instance of this row count + auto start = thrust::make_transform_iterator( + splits->begin(), [](gpu::chunk_read_info const& i) { return i.skip_rows; }); + auto end = start + splits->size(); + auto split = std::find(start, end, sizes[idx].row_count); + auto const split_index = [&]() -> int { + if (split != end && + ((idx == sizes.size() - 1) || (sizes[idx + 1].row_count > sizes[idx].row_count))) { + return static_cast(std::distance(start, split)); + } + return idx == 0 ? 
0 : -1;
+      }();
+      if (split_index >= 0) {
+        printf(" <-- split {%lu, %lu}",
+               splits.value()[split_index].skip_rows,
+               splits.value()[split_index].num_rows);
+      }
+    }
+    printf("\n");
+  }
+}
+#endif  // PREPROCESS_DEBUG
+
+/**
+ * @brief Functor which reduces two cumulative_row_info structs of the same key.
+ */
+struct cumulative_row_sum {
+  cumulative_row_info operator()
+    __device__(cumulative_row_info const& a, cumulative_row_info const& b) const
+  {
+    return cumulative_row_info{a.row_count + b.row_count, a.size_bytes + b.size_bytes, a.key};
+  }
+};
+
+/**
+ * @brief Functor which computes the total data size for a given type of cudf column.
+ *
+ * In the case of strings, the return size does not include the chars themselves. That
+ * information is tracked separately (see PageInfo::str_bytes).
+ */
+struct row_size_functor {
+  __device__ size_t validity_size(size_t num_rows, bool nullable)
+  {
+    return nullable ? (cudf::util::div_rounding_up_safe(num_rows, size_t{32}) * 4) : 0;
+  }
+
+  template <typename T>
+  __device__ size_t operator()(size_t num_rows, bool nullable)
+  {
+    auto const element_size = sizeof(device_storage_type_t<T>);
+    return (element_size * num_rows) + validity_size(num_rows, nullable);
+  }
+};
+
+template <>
+__device__ size_t row_size_functor::operator()<list_view>(size_t num_rows, bool nullable)
+{
+  auto const offset_size = sizeof(offset_type);
+  // NOTE: Adding the + 1 offset here isn't strictly correct. There will only be 1 extra offset
+  // for the entire column, whereas this is adding an extra offset per page. So we will get a
+  // small over-estimate of the real size of the order : # of pages * 4 bytes. It seems better
+  // to overestimate size somewhat than to underestimate it and potentially generate chunks
+  // that are too large.
+  return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable);
+}
+
+template <>
+__device__ size_t row_size_functor::operator()<struct_view>(size_t num_rows, bool nullable)
+{
+  return validity_size(num_rows, nullable);
+}
+
+template <>
+__device__ size_t row_size_functor::operator()<string_view>(size_t num_rows, bool nullable)
+{
+  // only returns the size of offsets and validity. the size of the actual string chars
+  // is tracked separately.
+  auto const offset_size = sizeof(offset_type);
+  // see note about offsets in the list_view template.
+  return (offset_size * (num_rows + 1)) + validity_size(num_rows, nullable);
+}
+
+/**
+ * @brief Functor which computes the total output cudf data size for all of
+ * the data in this page.
+ *
+ * Sums across all nesting levels.
+ */
+struct get_cumulative_row_info {
+  gpu::PageInfo const* const pages;
+
+  __device__ cumulative_row_info operator()(size_type index)
+  {
+    auto const& page = pages[index];
+    if (page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) {
+      return cumulative_row_info{0, 0, page.src_col_schema};
+    }
+
+    // total nested size, not counting string data
+    auto iter =
+      cudf::detail::make_counting_transform_iterator(0, [page, index] __device__(size_type i) {
+        auto const& pni = page.nesting[i];
+        return cudf::type_dispatcher(
+          data_type{pni.type}, row_size_functor{}, pni.size, pni.nullable);
+      });
+
+    size_t const row_count = static_cast<size_t>(page.nesting[0].size);
+    return {row_count,
+            thrust::reduce(thrust::seq, iter, iter + page.num_nesting_levels) + page.str_bytes,
+            page.src_col_schema};
+  }
+};
+
+/**
+ * @brief Functor which computes the effective size of all input columns by page.
+ *
+ * For a given row, we want to find the cost of all pages for all columns involved
+ * in loading up to that row.
The complication here is that not all pages are the + * same size between columns. Example: + * + * page row counts + * Column A: 0 <----> 100 <----> 200 + * Column B: 0 <---------------> 200 <--------> 400 + | + * if we decide to split at row 100, we don't really know the actual amount of bytes in column B + * at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that + * page. Essentially, a conservative over-estimate of the real size. + */ +struct row_total_size { + cumulative_row_info const* c_info; + size_type const* key_offsets; + size_t num_keys; + + __device__ cumulative_row_info operator()(cumulative_row_info const& i) + { + // sum sizes for each input column at this row + size_t sum = 0; + for (int idx = 0; idx < num_keys; idx++) { + auto const start = key_offsets[idx]; + auto const end = key_offsets[idx + 1]; + auto iter = cudf::detail::make_counting_transform_iterator( + 0, [&] __device__(size_type i) { return c_info[i].row_count; }); + auto const page_index = + thrust::lower_bound(thrust::seq, iter + start, iter + end, i.row_count) - iter; + sum += c_info[page_index].size_bytes; + } + return {i.row_count, sum, i.key}; + } +}; + +/** + * @brief Given a vector of cumulative {row_count, byte_size} pairs and a chunk read + * limit, determine the set of splits. + * + * @param sizes Vector of cumulative {row_count, byte_size} pairs + * @param num_rows Total number of rows to read + * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns + */ +std::vector find_splits(std::vector const& sizes, + size_t num_rows, + size_t chunk_read_limit) +{ + // now we have an array of {row_count, real output bytes}. just walk through it and generate + // splits. + // TODO: come up with a clever way to do this entirely in parallel. For now, as long as batch + // sizes are reasonably large, this shouldn't iterate too many times + std::vector splits; + { + size_t cur_pos = 0; + size_t cur_cumulative_size = 0; + size_t cur_row_count = 0; + auto start = thrust::make_transform_iterator(sizes.begin(), [&](cumulative_row_info const& i) { + return i.size_bytes - cur_cumulative_size; + }); + auto end = start + sizes.size(); + while (cur_row_count < num_rows) { + int64_t split_pos = + thrust::lower_bound(thrust::seq, start + cur_pos, end, chunk_read_limit) - start; + + // if we're past the end, or if the returned bucket is > than the chunk_read_limit, move back + // one. + if (static_cast(split_pos) >= sizes.size() || + (sizes[split_pos].size_bytes - cur_cumulative_size > chunk_read_limit)) { + split_pos--; + } + + // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this in + // a loop because all of the cumulative sizes for all the pages are sorted into one big list. + // so if we had two columns, both of which had an entry {1000, 10000}, that entry would be in + // the list twice. so we have to iterate until we skip past all of them. The idea is that we + // either do this, or we have to call unique() on the input first. 
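+      // Hypothetical illustration: if columns A and B each contribute an entry
+      // {row_count: 1000, size: 10000}, lower_bound may land on A's copy; we must also step
+      // past B's copy, otherwise the next iteration would try to split at row 1000 again.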
+
+      // best-try. if we can't find something that'll fit, we have to go bigger. we're doing this
+      // in a loop because all of the cumulative sizes for all the pages are sorted into one big
+      // list. so if we had two columns, both of which had an entry {1000, 10000}, that entry
+      // would be in the list twice. so we have to iterate until we skip past all of them. The
+      // idea is that we either do this, or we have to call unique() on the input first.
+      while (split_pos < (static_cast<int64_t>(sizes.size()) - 1) &&
+             (split_pos < 0 || sizes[split_pos].row_count == cur_row_count)) {
+        split_pos++;
+      }
+
+      auto const start_row = cur_row_count;
+      cur_row_count        = sizes[split_pos].row_count;
+      splits.push_back(gpu::chunk_read_info{start_row, cur_row_count - start_row});
+      cur_pos             = split_pos;
+      cur_cumulative_size = sizes[split_pos].size_bytes;
+    }
+  }
+  // print_cumulative_row_info(sizes, "adjusted", splits);
+
+  return splits;
+}
+
+/**
+ * @brief Given a set of pages that have had their sizes computed by nesting level and
+ * a limit on total read size, generate a set of {skip_rows, num_rows} pairs representing
+ * a set of reads that will generate output columns of total size <= `chunk_read_limit` bytes.
+ *
+ * @param pages All pages in the file
+ * @param id Additional intermediate information required to process the pages
+ * @param num_rows Total number of rows to read
+ * @param chunk_read_limit Limit on total number of bytes to be returned per read, for all columns
+ * @param stream CUDA stream to use
+ */
+std::vector<gpu::chunk_read_info> compute_splits(hostdevice_vector<gpu::PageInfo>& pages,
+                                                 gpu::chunk_intermediate_data const& id,
+                                                 size_t num_rows,
+                                                 size_t chunk_read_limit,
+                                                 rmm::cuda_stream_view stream)
+{
+  auto const& page_keys  = id.page_keys;
+  auto const& page_index = id.page_index;
+
+  // generate cumulative row counts and sizes
+  rmm::device_uvector<cumulative_row_info> c_info(page_keys.size(), stream);
+  // convert PageInfo to cumulative_row_info
+  auto page_input = thrust::make_transform_iterator(page_index.begin(),
+                                                    get_cumulative_row_info{pages.device_ptr()});
+  thrust::inclusive_scan_by_key(rmm::exec_policy(stream),
+                                page_keys.begin(),
+                                page_keys.end(),
+                                page_input,
+                                c_info.begin(),
+                                thrust::equal_to{},
+                                cumulative_row_sum{});
+  // print_cumulative_page_info(pages, page_index, c_info, stream);
+
+  // sort by row count
+  rmm::device_uvector<cumulative_row_info> c_info_sorted{c_info, stream};
+  thrust::sort(rmm::exec_policy(stream),
+               c_info_sorted.begin(),
+               c_info_sorted.end(),
+               [] __device__(cumulative_row_info const& a, cumulative_row_info const& b) {
+                 return a.row_count < b.row_count;
+               });
+
+  std::vector<cumulative_row_info> h_c_info_sorted(c_info_sorted.size());
+  cudaMemcpy(h_c_info_sorted.data(),
+             c_info_sorted.data(),
+             sizeof(cumulative_row_info) * c_info_sorted.size(),
+             cudaMemcpyDeviceToHost);
+  // print_cumulative_row_info(h_c_info_sorted, "raw");
+
+  // generate key offsets (offsets to the start of each partition of keys). worst case is 1 page
+  // per key
+  rmm::device_uvector<size_type> key_offsets(page_keys.size() + 1, stream);
+  auto const key_offsets_end = thrust::reduce_by_key(rmm::exec_policy(stream),
+                                                     page_keys.begin(),
+                                                     page_keys.end(),
+                                                     thrust::make_constant_iterator(1),
+                                                     thrust::make_discard_iterator(),
+                                                     key_offsets.begin())
+                                 .second;
+  size_t const num_unique_keys = key_offsets_end - key_offsets.begin();
+  thrust::exclusive_scan(
+    rmm::exec_policy(stream), key_offsets.begin(), key_offsets.end(), key_offsets.begin());
+
+  // adjust the cumulative info such that for each row count, the size includes any pages that span
+  // that row count. this is so that if we have this case:
+  //              page row counts
+  // Column A:    0 <----> 100 <----> 200
+  // Column B:    0 <---------------> 200 <--------> 400
+  //                        |
+  // if we decide to split at row 100, we don't really know the actual amount of bytes in column B
+  // at that point. So we have to proceed as if we are taking the bytes from all 200 rows of that
+  // page.
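To make the conservative "spanning page" rule above concrete, here is a host-side analogue (a sketch with invented types; the real code does this per key over one sorted cumulative array rather than per-key vectors):

#include <algorithm>
#include <cstddef>
#include <vector>

struct cum { std::size_t row_count; std::size_t size_bytes; };

// total bytes needed across all columns to read up to `row_count` rows;
// assumes each per-key vector is non-empty and sorted by row_count
std::size_t toy_row_total_size(std::vector<std::vector<cum>> const& per_key,
                               std::size_t row_count)
{
  std::size_t sum = 0;
  for (auto const& pages : per_key) {
    auto it = std::lower_bound(pages.begin(), pages.end(), row_count,
                               [](cum const& c, std::size_t v) { return c.row_count < v; });
    if (it == pages.end()) { it = pages.end() - 1; }  // clamp to the last page
    sum += it->size_bytes;  // charge the whole spanning page, as described above
  }
  return sum;
}

// e.g. Column A pages end at rows {100, 200}, Column B at {200, 400}; asking for
// row 100 charges A's first page but all of B's first (200-row) page.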
+  //
+  rmm::device_uvector<cumulative_row_info> aggregated_info(c_info.size(), stream);
+  thrust::transform(rmm::exec_policy(stream),
+                    c_info_sorted.begin(),
+                    c_info_sorted.end(),
+                    aggregated_info.begin(),
+                    row_total_size{c_info.data(), key_offsets.data(), num_unique_keys});
+
+  // bring back to the cpu
+  std::vector<cumulative_row_info> h_aggregated_info(aggregated_info.size());
+  cudaMemcpyAsync(h_aggregated_info.data(),
+                  aggregated_info.data(),
+                  sizeof(cumulative_row_info) * c_info.size(),
+                  cudaMemcpyDeviceToHost,
+                  stream);
+  stream.synchronize();
+
+  return find_splits(h_aggregated_info, num_rows, chunk_read_limit);
+}
+
+struct get_page_chunk_idx {
+  __device__ size_type operator()(gpu::PageInfo const& page) { return page.chunk_idx; }
+};
+
+struct get_page_num_rows {
+  __device__ size_type operator()(gpu::PageInfo const& page) { return page.num_rows; }
+};
+
+struct get_page_schema {
+  __device__ size_type operator()(gpu::PageInfo const& page) { return page.src_col_schema; }
+};
+
+/**
+ * @brief Returns the size field of a PageInfo struct for a given depth, keyed by schema.
+ */
+struct get_page_nesting_size {
+  size_type const src_col_schema;
+  size_type const depth;
+  gpu::PageInfo const* const pages;
+
+  __device__ size_type operator()(int index) const
+  {
+    auto const& page = pages[index];
+    if (page.src_col_schema != src_col_schema || page.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) {
+      return 0;
+    }
+    return page.nesting[depth].batch_size;
+  }
+};
+
+/**
+ * @brief Writes to the chunk_row field of the PageInfo struct.
+ */
+struct chunk_row_output_iter {
+  gpu::PageInfo* p;
+  using value_type        = size_type;
+  using difference_type   = size_type;
+  using pointer           = size_type*;
+  using reference         = size_type&;
+  using iterator_category = thrust::output_device_iterator_tag;
+
+  __host__ __device__ chunk_row_output_iter operator+(int i)
+  {
+    return chunk_row_output_iter{p + i};
+  }
+
+  __host__ __device__ void operator++() { p++; }
+
+  __device__ reference operator[](int i) { return p[i].chunk_row; }
+  __device__ reference operator*() { return p->chunk_row; }
+};
+
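chunk_row_output_iter above (and start_offset_output_iterator below) follow the same minimal output-iterator recipe; a standalone analogue, built around a hypothetical item struct, shows how little thrust actually requires:

#include <thrust/iterator/iterator_categories.h>

struct item { int key; int value; int running; };

// minimal thrust-compatible output iterator that writes scan results
// straight into item::running instead of a separate output array
struct running_output_iter {
  item* p;
  using value_type        = int;
  using difference_type   = int;
  using pointer           = int*;
  using reference         = int&;
  using iterator_category = thrust::output_device_iterator_tag;

  __host__ __device__ running_output_iter operator+(int i) { return running_output_iter{p + i}; }
  __host__ __device__ void operator++() { p++; }
  __device__ reference operator[](int i) { return p[i].running; }
  __device__ reference operator*() { return p->running; }
};

// usage mirrors the scan below: thrust::exclusive_scan_by_key(policy, keys, keys + n,
// values, running_output_iter{d_items}) fills d_items[i].running in place.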
+/**
+ * @brief Writes to the page_start_value field of the PageNestingInfo struct, keyed by schema.
+ */
+struct start_offset_output_iterator {
+  gpu::PageInfo* pages;
+  int const* page_indices;
+  int cur_index;
+  int src_col_schema;
+  int nesting_depth;
+  int empty               = 0;
+  using value_type        = size_type;
+  using difference_type   = size_type;
+  using pointer           = size_type*;
+  using reference         = size_type&;
+  using iterator_category = thrust::output_device_iterator_tag;
+
+  constexpr void operator=(start_offset_output_iterator const& other)
+  {
+    pages          = other.pages;
+    page_indices   = other.page_indices;
+    cur_index      = other.cur_index;
+    src_col_schema = other.src_col_schema;
+    nesting_depth  = other.nesting_depth;
+  }
+
+  constexpr start_offset_output_iterator operator+(int i)
+  {
+    return start_offset_output_iterator{
+      pages, page_indices, cur_index + i, src_col_schema, nesting_depth};
+  }
+
+  constexpr void operator++() { cur_index++; }
+
+  __device__ reference operator[](int i) { return dereference(cur_index + i); }
+  __device__ reference operator*() { return dereference(cur_index); }
+
+ private:
+  __device__ reference dereference(int index)
+  {
+    gpu::PageInfo const& p = pages[page_indices[index]];
+    if (p.src_col_schema != src_col_schema || p.flags & gpu::PAGEINFO_FLAGS_DICTIONARY) {
+      return empty;
+    }
+    return p.nesting[nesting_depth].page_start_value;
+  }
+};
+
+}  // anonymous namespace
+
+void reader::impl::preprocess_pages(size_t skip_rows,
+                                    size_t num_rows,
+                                    bool uses_custom_row_bounds,
+                                    size_t chunk_read_limit)
+{
+  auto& chunks = _file_itm_data.chunks;
+  auto& pages  = _file_itm_data.pages_info;
+
+  // iterate over all input columns and determine if they contain lists so we can further
+  // preprocess them.
+  bool has_lists = false;
+  for (size_t idx = 0; idx < _input_columns.size(); idx++) {
+    auto const& input_col  = _input_columns[idx];
+    size_t const max_depth = input_col.nesting_depth();
+
+    auto* cols = &_output_buffers;
+    for (size_t l_idx = 0; l_idx < max_depth; l_idx++) {
+      auto& out_buf = (*cols)[input_col.nesting[l_idx]];
+      cols          = &out_buf.children;
+
+      // if this has a list parent, we have to get column sizes from the
+      // data computed during gpu::ComputePageSizes
+      if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) {
+        has_lists = true;
+        break;
+      }
+    }
+    if (has_lists) { break; }
+  }
+
+  // generate string dict indices if necessary
+  {
+    auto is_dict_chunk = [](const gpu::ColumnChunkDesc& chunk) {
+      return (chunk.data_type & 0x7) == BYTE_ARRAY && chunk.num_dict_pages > 0;
+    };
+
+    // Count the number of string dictionary entries
+    // NOTE: Assumes first page in the chunk is always the dictionary page
+    size_t total_str_dict_indexes = 0;
+    for (size_t c = 0, page_count = 0; c < chunks.size(); c++) {
+      if (is_dict_chunk(chunks[c])) {
+        total_str_dict_indexes += pages[page_count].num_input_values;
+      }
+      page_count += chunks[c].max_num_pages;
+    }
+
+    // Build index for string dictionaries since they can't be indexed
+    // directly due to variable-sized elements
+    _chunk_itm_data.str_dict_index =
+      cudf::detail::make_zeroed_device_uvector_async<string_index_pair>(total_str_dict_indexes,
+                                                                        _stream);
+
+    // Update chunks with pointers to string dict indices
+    for (size_t c = 0, page_count = 0, str_ofs = 0; c < chunks.size(); c++) {
+      input_column_info const& input_col = _input_columns[chunks[c].src_col_index];
+      CUDF_EXPECTS(input_col.schema_idx == chunks[c].src_col_schema,
+                   "Column/page schema index mismatch");
+      if (is_dict_chunk(chunks[c])) {
+        chunks[c].str_dict_index = _chunk_itm_data.str_dict_index.data() + str_ofs;
+        str_ofs += pages[page_count].num_input_values;
+      }
+
+      // column_data_base will always point to leaf data, even for nested types.
+      page_count += chunks[c].max_num_pages;
+    }
+
+    if (total_str_dict_indexes > 0) {
+      chunks.host_to_device(_stream);
+      gpu::BuildStringDictionaryIndex(chunks.device_ptr(), chunks.size(), _stream);
+    }
+  }
+
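A note on the is_dict_chunk test above: `data_type & 0x7` assumes the reader packs the parquet physical type into the low 3 bits of ColumnChunkDesc::data_type, with the declared type length stored in the bits above (BYTE_ARRAY is physical type 6 in the parquet format). A sketch of that packing, stated as an assumption from the surrounding code rather than a documented API:

#include <cstdint>

// physical type in bits [0, 3), declared type length in the remaining bits
constexpr uint16_t pack_type(uint16_t physical_type, uint16_t type_length)
{
  return physical_type | (type_length << 3);
}
constexpr uint16_t physical_type_of(uint16_t packed) { return packed & 0x7; }

static_assert(physical_type_of(pack_type(/*BYTE_ARRAY=*/6, 0)) == 6, "low bits hold the type");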
+  // intermediate data we will need for further chunked reads
+  if (has_lists || chunk_read_limit > 0) {
+    // computes:
+    // PageNestingInfo::num_rows for each page. the true number of rows (taking repetition into
+    // account), not just the number of values. PageNestingInfo::size for each level of nesting,
+    // for each page.
+    //
+    // we will be applying a later "trim" pass if skip_rows/num_rows is being used, which can
+    // happen if:
+    // - user has passed custom row bounds
+    // - we will be doing a chunked read
+    gpu::ComputePageSizes(pages,
+                          chunks,
+                          0,  // 0-max size_t. process all possible rows
+                          std::numeric_limits<size_t>::max(),
+                          true,                  // compute num_rows
+                          chunk_read_limit > 0,  // compute string sizes
+                          _stream);
+
+    // computes:
+    // PageInfo::chunk_row (the absolute start row index) for all pages
+    // Note: this is doing some redundant work for pages in flat hierarchies. chunk_row has already
+    // been computed during header decoding. the overall amount of work here is very small though.
+    auto key_input  = thrust::make_transform_iterator(pages.device_ptr(), get_page_chunk_idx{});
+    auto page_input = thrust::make_transform_iterator(pages.device_ptr(), get_page_num_rows{});
+    thrust::exclusive_scan_by_key(rmm::exec_policy(_stream),
+                                  key_input,
+                                  key_input + pages.size(),
+                                  page_input,
+                                  chunk_row_output_iter{pages.device_ptr()});
+
+    // compute page ordering.
+    //
+    // ordering of pages is by input column schema, repeated across row groups. so
+    // if we had 3 columns, each with 2 pages, and 1 row group, our schema values might look like
+    //
+    // 1, 1, 2, 2, 3, 3
+    //
+    // However, if we had more than one row group, the pattern would be
+    //
+    // 1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3
+    // ^ row group 0     |
+    //                   ^ row group 1
+    //
+    // To use exclusive_scan_by_key, the ordering we actually want is
+    //
+    // 1, 1, 1, 1, 2, 2, 2, 2, 3, 3, 3, 3
+    //
+    // We also need to preserve key-relative page ordering, so we need to use a stable sort.
+    _chunk_itm_data.page_keys  = rmm::device_uvector<int>(pages.size(), _stream);
+    _chunk_itm_data.page_index = rmm::device_uvector<int>(pages.size(), _stream);
+    auto& page_keys  = _chunk_itm_data.page_keys;
+    auto& page_index = _chunk_itm_data.page_index;
+    {
+      thrust::transform(rmm::exec_policy(_stream),
+                        pages.device_ptr(),
+                        pages.device_ptr() + pages.size(),
+                        page_keys.begin(),
+                        get_page_schema{});
+
+      thrust::sequence(rmm::exec_policy(_stream), page_index.begin(), page_index.end());
+      thrust::stable_sort_by_key(rmm::exec_policy(_stream),
+                                 page_keys.begin(),
+                                 page_keys.end(),
+                                 page_index.begin(),
+                                 thrust::less<int>());
+    }
+
+    // retrieve pages back
+    pages.device_to_host(_stream, true);
+
+#if defined(PREPROCESS_DEBUG)
+    print_pages(pages, _stream);
+#endif
+  }
+
+  // compute splits if necessary. otherwise return a single split representing
+  // the whole file.
+  _chunk_read_info = chunk_read_limit > 0
+                       ? compute_splits(pages, _chunk_itm_data, num_rows, chunk_read_limit, _stream)
+                       : std::vector<gpu::chunk_read_info>{{skip_rows, num_rows}};
+}
+
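The ordering transformation described above can be reproduced in isolation; a toy sketch with invented keys (two row groups, three columns, two pages each):

#include <thrust/device_vector.h>
#include <thrust/functional.h>
#include <thrust/sequence.h>
#include <thrust/sort.h>
#include <vector>

void toy_page_ordering()
{
  // schema keys as they appear on disk: 1,1,2,2,3,3 repeated per row group
  std::vector<int> h{1, 1, 2, 2, 3, 3, 1, 1, 2, 2, 3, 3};
  thrust::device_vector<int> keys(h.begin(), h.end());
  thrust::device_vector<int> index(keys.size());
  thrust::sequence(index.begin(), index.end());

  // stable sort groups the keys while keeping each key's pages in row-group order:
  // keys  -> 1,1,1,1, 2,2,2,2, 3,3,3,3
  // index -> 0,1,6,7, 2,3,8,9, 4,5,10,11
  thrust::stable_sort_by_key(keys.begin(), keys.end(), index.begin(), thrust::less<int>());
}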
+void reader::impl::allocate_columns(size_t skip_rows, size_t num_rows, bool uses_custom_row_bounds)
+{
+  auto const& chunks = _file_itm_data.chunks;
+  auto& pages        = _file_itm_data.pages_info;
+
+  // Should not reach here if there is no page data.
+  CUDF_EXPECTS(pages.size() > 0, "There is no page to parse");
+
+  // computes:
+  // PageNestingInfo::batch_size for each level of nesting, for each page, taking row bounds into
+  // account. PageInfo::skipped_values, which tells us where to start decoding in the input to
+  // respect the user bounds. It is only necessary to do this second pass if uses_custom_row_bounds
+  // is set (if the user has specified artificial bounds).
+  if (uses_custom_row_bounds) {
+    gpu::ComputePageSizes(pages,
+                          chunks,
+                          skip_rows,
+                          num_rows,
+                          false,  // num_rows is already computed
+                          false,  // no need to compute string sizes
+                          _stream);
+#if defined(PREPROCESS_DEBUG)
+    print_pages(pages, _stream);
+#endif
+  }
+
+  // iterate over all input columns and allocate any associated output
+  // buffers if they are not part of a list hierarchy. mark down
+  // if we have any list columns that need further processing.
+  bool has_lists = false;
+  for (size_t idx = 0; idx < _input_columns.size(); idx++) {
+    auto const& input_col  = _input_columns[idx];
+    size_t const max_depth = input_col.nesting_depth();
+
+    auto* cols = &_output_buffers;
+    for (size_t l_idx = 0; l_idx < max_depth; l_idx++) {
+      auto& out_buf = (*cols)[input_col.nesting[l_idx]];
+      cols          = &out_buf.children;
+
+      // if this has a list parent, we have to get column sizes from the
+      // data computed during gpu::ComputePageSizes
+      if (out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) {
+        has_lists = true;
+      }
+      // if we haven't already processed this column because it is part of a struct hierarchy
+      else if (out_buf.size == 0) {
+        // add 1 for the offset if this is a list column
+        out_buf.create(
+          out_buf.type.id() == type_id::LIST && l_idx < max_depth ? num_rows + 1 : num_rows,
+          _stream,
+          _mr);
+      }
+    }
+  }
+
+  // compute output column sizes by examining the pages of the -input- columns
+  if (has_lists) {
+    auto& page_keys  = _chunk_itm_data.page_keys;
+    auto& page_index = _chunk_itm_data.page_index;
+    for (size_t idx = 0; idx < _input_columns.size(); idx++) {
+      auto const& input_col = _input_columns[idx];
+      auto src_col_schema   = input_col.schema_idx;
+      size_t max_depth      = input_col.nesting_depth();
+
+      auto* cols = &_output_buffers;
+      for (size_t l_idx = 0; l_idx < input_col.nesting_depth(); l_idx++) {
+        auto& out_buf = (*cols)[input_col.nesting[l_idx]];
+        cols          = &out_buf.children;
+
+        // size iterator. indexes pages by sorted order
+        auto size_input = thrust::make_transform_iterator(
+          page_index.begin(),
+          get_page_nesting_size{src_col_schema, static_cast<size_type>(l_idx), pages.device_ptr()});
+
+        // if this buffer is part of a list hierarchy, we need to determine its
+        // final size and allocate it here.
+        //
+        // for struct columns, higher levels of the output columns are shared between input
+        // columns. so don't compute any given level more than once.
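For reference on the `num_rows + 1` allocations above: list and string columns store one offset per row plus one terminating offset, so the offsets buffer is always one element longer than the row count. A minimal illustration:

#include <cstdint>

// rows:    [ [a, b], [], [c] ]  -> 3 rows
// offsets: [ 0, 2, 2, 3 ]       -> num_rows + 1 entries; row i spans
//                                  [offsets[i], offsets[i + 1])
int32_t const list_offsets[4] = {0, 2, 2, 3};
static_assert(sizeof(list_offsets) / sizeof(list_offsets[0]) == 3 + 1,
              "offsets length is row count plus one");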
+ if ((out_buf.user_data & PARQUET_COLUMN_BUFFER_FLAG_HAS_LIST_PARENT) && out_buf.size == 0) { + int size = + thrust::reduce(rmm::exec_policy(_stream), size_input, size_input + pages.size()); + + // if this is a list column add 1 for non-leaf levels for the terminating offset + if (out_buf.type.id() == type_id::LIST && l_idx < max_depth) { size++; } + + // allocate + out_buf.create(size, _stream, _mr); + } + + // for nested hierarchies, compute per-page start offset + if (input_col.has_repetition) { + thrust::exclusive_scan_by_key( + rmm::exec_policy(_stream), + page_keys.begin(), + page_keys.end(), + size_input, + start_offset_output_iterator{pages.device_ptr(), + page_index.begin(), + 0, + static_cast(src_col_schema), + static_cast(l_idx)}); + } + } + } + } +} + +} // namespace cudf::io::detail::parquet diff --git a/cpp/src/io/parquet/writer_impl.cu b/cpp/src/io/parquet/writer_impl.cu index 9514b053451..26b3f97616f 100644 --- a/cpp/src/io/parquet/writer_impl.cu +++ b/cpp/src/io/parquet/writer_impl.cu @@ -19,6 +19,7 @@ * @brief cuDF-IO parquet writer class implementation */ +#include "parquet_gpu.cuh" #include "writer_impl.hpp" #include "compact_protocol_reader.hpp" @@ -374,44 +375,53 @@ struct leaf_schema_fn { template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT32; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 24 * 60 * 60 * 1000; + col_schema.logical_type.isset.TIME = true; + col_schema.logical_type.TIME.unit.isset.MILLIS = true; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.ts_scale = 1000; + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.ts_scale = 1000; + col_schema.logical_type.isset.TIME = true; + col_schema.logical_type.TIME.unit.isset.MILLIS = true; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MILLIS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.type = Type::INT32; + col_schema.converted_type = ConvertedType::TIME_MILLIS; + col_schema.stats_dtype = statistics_dtype::dtype_int32; + col_schema.logical_type.isset.TIME = true; + col_schema.logical_type.TIME.unit.isset.MILLIS = true; } template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MICROS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.type = Type::INT64; + col_schema.converted_type = ConvertedType::TIME_MICROS; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type.isset.TIME = true; + col_schema.logical_type.TIME.unit.isset.MICROS = true; } // unsupported outside cudf for parquet 1.0. 
template std::enable_if_t, void> operator()() { - col_schema.type = Type::INT64; - col_schema.converted_type = ConvertedType::TIME_MICROS; - col_schema.stats_dtype = statistics_dtype::dtype_int64; - col_schema.ts_scale = -1000; // negative value indicates division by absolute value + col_schema.type = Type::INT64; + col_schema.stats_dtype = statistics_dtype::dtype_int64; + col_schema.logical_type.isset.TIME = true; + col_schema.logical_type.TIME.unit.isset.NANOS = true; } template @@ -511,7 +521,7 @@ std::vector construct_schema_tree( if (col->type().id() != type_id::LIST) { return false; } auto const child_col_type = col->children[lists_column_view::child_column_index]->type().id(); - return child_col_type == type_id::INT8 or child_col_type == type_id::UINT8; + return child_col_type == type_id::UINT8; }; // There is a special case for a list column with one byte column child. This column can @@ -917,7 +927,7 @@ auto to_nvcomp_compression_type(Compression codec) auto page_alignment(Compression codec) { if (codec == Compression::UNCOMPRESSED or - not nvcomp::is_compression_enabled(to_nvcomp_compression_type(codec))) { + nvcomp::is_compression_disabled(to_nvcomp_compression_type(codec))) { return 1u; } @@ -1162,19 +1172,22 @@ void writer::impl::encode_pages(hostdevice_2dvector& chunks gpu::EncodePages(batch_pages, comp_in, comp_out, comp_res, stream); switch (compression_) { case parquet::Compression::SNAPPY: - if (nvcomp::is_compression_enabled(nvcomp::compression_type::SNAPPY)) { + if (nvcomp::is_compression_disabled(nvcomp::compression_type::SNAPPY)) { + gpu_snap(comp_in, comp_out, comp_res, stream); + } else { nvcomp::batched_compress( nvcomp::compression_type::SNAPPY, comp_in, comp_out, comp_res, stream); - } else { - gpu_snap(comp_in, comp_out, comp_res, stream); } break; - case parquet::Compression::ZSTD: - if (nvcomp::is_compression_enabled(nvcomp::compression_type::ZSTD)) { - nvcomp::batched_compress( - nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); + case parquet::Compression::ZSTD: { + if (auto const reason = nvcomp::is_compression_disabled(nvcomp::compression_type::ZSTD); + reason) { + CUDF_FAIL("Compression error: " + reason.value()); } + nvcomp::batched_compress(nvcomp::compression_type::ZSTD, comp_in, comp_out, comp_res, stream); + break; + } case parquet::Compression::UNCOMPRESSED: break; default: CUDF_FAIL("invalid compression type"); } @@ -1236,9 +1249,9 @@ size_t max_page_bytes(Compression compression, size_t max_page_size_bytes) if (compression == parquet::Compression::UNCOMPRESSED) { return max_page_size_bytes; } auto const ncomp_type = to_nvcomp_compression_type(compression); - auto const nvcomp_limit = nvcomp::is_compression_enabled(ncomp_type) - ? nvcomp::compress_max_allowed_chunk_size(ncomp_type) - : std::nullopt; + auto const nvcomp_limit = nvcomp::is_compression_disabled(ncomp_type) + ? 
std::nullopt + : nvcomp::compress_max_allowed_chunk_size(ncomp_type); return std::min(nvcomp_limit.value_or(max_page_size_bytes), max_page_size_bytes); } diff --git a/cpp/src/io/statistics/statistics_type_identification.cuh b/cpp/src/io/statistics/statistics_type_identification.cuh index 10a7518aefa..9fc30c625aa 100644 --- a/cpp/src/io/statistics/statistics_type_identification.cuh +++ b/cpp/src/io/statistics/statistics_type_identification.cuh @@ -74,8 +74,7 @@ struct conversion_map { template <> struct conversion_map { using types = std::tuple, - std::pair, - std::pair>; + std::pair>; }; /** diff --git a/cpp/src/io/text/bgzip_data_chunk_source.cu b/cpp/src/io/text/bgzip_data_chunk_source.cu index 7715c2ca7e1..3fa68cd8b0f 100644 --- a/cpp/src/io/text/bgzip_data_chunk_source.cu +++ b/cpp/src/io/text/bgzip_data_chunk_source.cu @@ -19,7 +19,9 @@ #include "io/utilities/config_utils.hpp" #include +#include #include +#include #include #include @@ -29,14 +31,12 @@ #include #include -#include #include #include #include namespace cudf::io::text { - namespace { /** @@ -64,71 +64,8 @@ struct bgzip_nvcomp_transform_functor { class bgzip_data_chunk_reader : public data_chunk_reader { private: - template - static IntType read_int(char* data) - { - IntType result{}; - // we assume little-endian - std::memcpy(&result, &data[0], sizeof(result)); - return result; - } - - struct bgzip_header { - int block_size; - int extra_length; - [[nodiscard]] int data_size() const { return block_size - extra_length - 20; } - }; - - bgzip_header read_header() - { - std::array buffer{}; - _data_stream->read(buffer.data(), sizeof(buffer)); - std::array const expected_header{{31, 139, 8, 4}}; - CUDF_EXPECTS( - std::equal( - expected_header.begin(), expected_header.end(), reinterpret_cast(buffer.data())), - "malformed BGZIP header"); - // we ignore the remaining bytes of the fixed header, since they don't matter to us - auto const extra_length = read_int(&buffer[10]); - uint16_t extra_offset{}; - // read all the extra subfields - while (extra_offset < extra_length) { - auto const remaining_size = extra_length - extra_offset; - CUDF_EXPECTS(remaining_size >= 4, "invalid extra field length"); - // a subfield consists of 2 identifier bytes and a uint16 length - // 66/67 identifies a BGZIP block size field, we skip all other fields - _data_stream->read(buffer.data(), 4); - extra_offset += 4; - auto const subfield_size = read_int(&buffer[2]); - if (buffer[0] == 66 && buffer[1] == 67) { - // the block size subfield contains a single uint16 value, which is block_size - 1 - CUDF_EXPECTS(subfield_size == sizeof(uint16_t), "malformed BGZIP extra subfield"); - _data_stream->read(buffer.data(), sizeof(uint16_t)); - _data_stream->seekg(remaining_size - 6, std::ios_base::cur); - auto const block_size_minus_one = read_int(&buffer[0]); - return {block_size_minus_one + 1, extra_length}; - } else { - _data_stream->seekg(subfield_size, std::ios_base::cur); - extra_offset += subfield_size; - } - } - CUDF_FAIL("missing BGZIP size extra subfield"); - } - - struct bgzip_footer { - uint32_t decompressed_size; - }; - - bgzip_footer read_footer() - { - std::array buffer{}; - _data_stream->read(buffer.data(), sizeof(buffer)); - return {read_int(&buffer[4])}; - } - template - using pinned_host_vector = - thrust::host_vector>; + using pinned_host_vector = thrust::host_vector>; template static void copy_to_device(const pinned_host_vector& host, @@ -207,7 +144,13 @@ class bgzip_data_chunk_reader : public data_chunk_reader { 
bgzip_nvcomp_transform_functor{reinterpret_cast(d_compressed_blocks.data()), reinterpret_cast(d_decompressed_blocks.begin())}); if (decompressed_size() > 0) { - if (cudf::io::detail::nvcomp_integration::is_all_enabled()) { + if (nvcomp::is_decompression_disabled(nvcomp::compression_type::DEFLATE)) { + gpuinflate(d_compressed_spans, + d_decompressed_spans, + d_decompression_results, + gzip_header_included::NO, + stream); + } else { cudf::io::nvcomp::batched_decompress(cudf::io::nvcomp::compression_type::DEFLATE, d_compressed_spans, d_decompressed_spans, @@ -215,12 +158,6 @@ class bgzip_data_chunk_reader : public data_chunk_reader { max_decompressed_size, decompressed_size(), stream); - } else { - gpuinflate(d_compressed_spans, - d_decompressed_spans, - d_decompression_results, - gzip_header_included::NO, - stream); } } is_decompressed = true; @@ -232,7 +169,7 @@ class bgzip_data_chunk_reader : public data_chunk_reader { h_compressed_offsets.resize(1); h_decompressed_offsets.resize(1); // shrinking doesn't allocate/free, so we don't need to worry about streams - auto stream = cudf::default_stream_value; + auto stream = cudf::get_default_stream(); d_compressed_blocks.resize(0, stream); d_decompressed_blocks.resize(0, stream); d_compressed_offsets.resize(0, stream); @@ -258,13 +195,13 @@ class bgzip_data_chunk_reader : public data_chunk_reader { return available_decompressed_size - read_pos; } - void read_block(bgzip_header header, std::istream& stream) + void read_block(detail::bgzip::header header, std::istream& stream) { h_compressed_blocks.resize(h_compressed_blocks.size() + header.data_size()); stream.read(h_compressed_blocks.data() + compressed_size(), header.data_size()); } - void add_block_offsets(bgzip_header header, bgzip_footer footer) + void add_block_offsets(detail::bgzip::header header, detail::bgzip::footer footer) { max_decompressed_size = std::max(footer.decompressed_size, max_decompressed_size); @@ -294,9 +231,9 @@ class bgzip_data_chunk_reader : public data_chunk_reader { // peek is necessary if we are already at the end, but didn't try to read another byte _data_stream->peek(); if (_data_stream->eof() || _compressed_pos > _compressed_end) { break; } - auto header = read_header(); + auto header = detail::bgzip::read_header(*_data_stream); _curr_blocks.read_block(header, *_data_stream); - auto footer = read_footer(); + auto footer = detail::bgzip::read_footer(*_data_stream); _curr_blocks.add_block_offsets(header, footer); // for the last GZIP block, we restrict ourselves to the bytes up to _local_end // but only for the reader, not for decompression! 
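The `virtual_begin >> 16` / `& 0xFFFFu` arithmetic used by this reader follows the standard BGZF virtual file offset encoding: the compressed (file) offset of a block lives in the upper 48 bits and the offset within that block's decompressed data in the lower 16. A small sketch, with helper names invented for illustration:

#include <cstdint>

// pack a BGZF virtual offset: compressed-block start plus in-block offset
constexpr uint64_t make_virtual_offset(uint64_t compressed_block_start, uint16_t local_offset)
{
  return (compressed_block_start << 16) | local_offset;
}

constexpr uint64_t compressed_part(uint64_t voffset) { return voffset >> 16; }
constexpr uint16_t local_part(uint64_t voffset) { return voffset & 0xFFFFu; }

static_assert(compressed_part(make_virtual_offset(1024, 100)) == 1024);
static_assert(local_part(make_virtual_offset(1024, 100)) == 100);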
@@ -318,8 +255,8 @@ class bgzip_data_chunk_reader : public data_chunk_reader { uint64_t virtual_begin, uint64_t virtual_end) : _data_stream(std::move(input_stream)), - _prev_blocks{cudf::default_stream_value}, // here we can use the default stream because - _curr_blocks{cudf::default_stream_value}, // we only initialize empty device_uvectors + _prev_blocks{cudf::get_default_stream()}, // here we can use the default stream because + _curr_blocks{cudf::get_default_stream()}, // we only initialize empty device_uvectors _local_end{virtual_end & 0xFFFFu}, _compressed_pos{virtual_begin >> 16}, _compressed_end{virtual_end >> 16} @@ -333,8 +270,8 @@ class bgzip_data_chunk_reader : public data_chunk_reader { // seek to the beginning of the provided local offset auto const local_pos = virtual_begin & 0xFFFFu; if (local_pos > 0) { - CUDF_EXPECTS(_curr_blocks.h_compressed_offsets.size() > 1 && - local_pos < _curr_blocks.h_compressed_offsets[1], + CUDF_EXPECTS(_curr_blocks.h_decompressed_offsets.size() > 1 && + local_pos < _curr_blocks.h_decompressed_offsets[1], "local part of virtual offset is out of bounds"); _curr_blocks.consume_bytes(local_pos); } diff --git a/cpp/src/io/text/bgzip_utils.cpp b/cpp/src/io/text/bgzip_utils.cpp new file mode 100644 index 00000000000..dd08387a6b5 --- /dev/null +++ b/cpp/src/io/text/bgzip_utils.cpp @@ -0,0 +1,179 @@ +/* + * Copyright (c) 2022, NVIDIA CORPORATION. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ + +#include + +#include +#include +#include + +#include +#include +#include +#include + +namespace cudf::io::text::detail::bgzip { +namespace { + +template +IntType read_int(char* data) +{ + IntType result{}; + // we assume little-endian + std::memcpy(&result, &data[0], sizeof(result)); + return result; +} + +template +void write_int(std::ostream& output_stream, T val) +{ + std::array bytes; + // we assume little-endian + std::memcpy(&bytes[0], &val, sizeof(T)); + output_stream.write(bytes.data(), bytes.size()); +} + +} // namespace + +std::array constexpr extra_blocklen_field_header{{66, 67, 2, 0}}; + +header read_header(std::istream& input_stream) +{ + std::array buffer{}; + input_stream.read(buffer.data(), sizeof(buffer)); + std::array constexpr expected_header{{31, 139, 8, 4}}; + CUDF_EXPECTS( + std::equal( + expected_header.begin(), expected_header.end(), reinterpret_cast(buffer.data())), + "malformed BGZIP header"); + // we ignore the remaining bytes of the fixed header, since they don't matter to us + auto const extra_length = read_int(&buffer[10]); + uint16_t extra_offset{}; + // read all the extra subfields + while (extra_offset < extra_length) { + auto const remaining_size = extra_length - extra_offset; + CUDF_EXPECTS(remaining_size >= 4, "invalid extra field length"); + // a subfield consists of 2 identifier bytes and a uint16 length + // 66/67 identifies a BGZIP block size field, we skip all other fields + input_stream.read(buffer.data(), 4); + extra_offset += 4; + auto const subfield_size = read_int(&buffer[2]); + if (buffer[0] == extra_blocklen_field_header[0] && + buffer[1] == extra_blocklen_field_header[1]) { + // the block size subfield contains a single uint16 value, which is block_size - 1 + CUDF_EXPECTS( + buffer[2] == extra_blocklen_field_header[2] && buffer[3] == extra_blocklen_field_header[3], + "malformed BGZIP extra subfield"); + input_stream.read(buffer.data(), sizeof(uint16_t)); + input_stream.seekg(remaining_size - 6, std::ios_base::cur); + auto const block_size_minus_one = read_int(&buffer[0]); + return {block_size_minus_one + 1, extra_length}; + } else { + input_stream.seekg(subfield_size, std::ios_base::cur); + extra_offset += subfield_size; + } + } + CUDF_FAIL("missing BGZIP size extra subfield"); +} + +footer read_footer(std::istream& input_stream) +{ + std::array buffer{}; + input_stream.read(buffer.data(), sizeof(buffer)); + return {read_int(&buffer[0]), read_int(&buffer[4])}; +} + +void write_footer(std::ostream& output_stream, host_span data) +{ + // compute crc32 with zlib, this allows checking the generated files with external tools + write_int(output_stream, crc32(0, (unsigned char*)data.data(), data.size())); + write_int(output_stream, data.size()); +} + +void write_header(std::ostream& output_stream, + uint16_t compressed_size, + host_span pre_size_subfield, + host_span post_size_subfield) +{ + std::array constexpr header_data{{ + 31, // magic number + 139, // magic number + 8, // compression type: deflate + 4, // flags: extra header + 0, // mtime + 0, // mtime + 0, // mtime + 0, // mtime: irrelevant + 4, // xfl: irrelevant + 3 // OS: irrelevant + }}; + output_stream.write(reinterpret_cast(header_data.data()), header_data.size()); + auto const extra_size = pre_size_subfield.size() + extra_blocklen_field_header.size() + + sizeof(uint16_t) + post_size_subfield.size(); + auto const block_size = + header_data.size() + sizeof(uint16_t) + extra_size + compressed_size + 2 * sizeof(uint32_t); + write_int(output_stream, extra_size); + 
output_stream.write(pre_size_subfield.data(), pre_size_subfield.size()); + output_stream.write(extra_blocklen_field_header.data(), extra_blocklen_field_header.size()); + CUDF_EXPECTS(block_size - 1 <= std::numeric_limits::max(), "block size overflow"); + write_int(output_stream, block_size - 1); + output_stream.write(post_size_subfield.data(), post_size_subfield.size()); +} + +void write_uncompressed_block(std::ostream& output_stream, + host_span data, + host_span pre_size_subfields, + host_span post_size_subfields) +{ + CUDF_EXPECTS(data.size() <= std::numeric_limits::max(), "data size overflow"); + write_header(output_stream, data.size() + 5, pre_size_subfields, post_size_subfields); + write_int(output_stream, 1); + write_int(output_stream, data.size()); + write_int(output_stream, ~static_cast(data.size())); + output_stream.write(data.data(), data.size()); + write_footer(output_stream, data); +} + +void write_compressed_block(std::ostream& output_stream, + host_span data, + host_span pre_size_subfields, + host_span post_size_subfields) +{ + CUDF_EXPECTS(data.size() <= std::numeric_limits::max(), "data size overflow"); + z_stream deflate_stream{}; + // let's make sure we have enough space to store the data + std::vector compressed_out(data.size() * 2 + 256); + deflate_stream.next_in = reinterpret_cast(const_cast(data.data())); + deflate_stream.avail_in = data.size(); + deflate_stream.next_out = reinterpret_cast(compressed_out.data()); + deflate_stream.avail_out = compressed_out.size(); + CUDF_EXPECTS( + deflateInit2(&deflate_stream, // stream + Z_DEFAULT_COMPRESSION, // compression level + Z_DEFLATED, // method + -15, // log2 of window size (negative value means no ZLIB header/footer) + 9, // mem level: best performance/most memory usage for compression + Z_DEFAULT_STRATEGY // strategy + ) == Z_OK, + "deflateInit failed"); + CUDF_EXPECTS(deflate(&deflate_stream, Z_FINISH) == Z_STREAM_END, "deflate failed"); + CUDF_EXPECTS(deflateEnd(&deflate_stream) == Z_OK, "deflateEnd failed"); + write_header(output_stream, deflate_stream.total_out, pre_size_subfields, post_size_subfields); + output_stream.write(compressed_out.data(), deflate_stream.total_out); + write_footer(output_stream, data); +} + +} // namespace cudf::io::text::detail::bgzip diff --git a/cpp/src/io/text/data_chunk_source_factories.cpp b/cpp/src/io/text/data_chunk_source_factories.cpp index 9a549951d66..c09e7be507f 100644 --- a/cpp/src/io/text/data_chunk_source_factories.cpp +++ b/cpp/src/io/text/data_chunk_source_factories.cpp @@ -17,12 +17,12 @@ #include "io/text/device_data_chunks.hpp" #include +#include #include #include #include -#include #include @@ -30,6 +30,86 @@ namespace cudf::io::text { namespace { +/** + * @brief A reader which produces owning chunks of device memory which contain a copy of the data + * from an istream. + */ +class datasource_chunk_reader : public data_chunk_reader { + struct host_ticket { + cudaEvent_t event; + thrust::host_vector> buffer; + }; + + constexpr static int num_tickets = 2; + + public: + datasource_chunk_reader(datasource* source) : _source(source) + { + // create an event to track the completion of the last device-to-host copy. 
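The two-ticket rotation set up in this constructor (and used by get_next_chunk below) is worth seeing in isolation: host staging buffers are guarded by CUDA events so a buffer is only reused once its previous async copy has drained. A sketch under simplifying assumptions, with error checking omitted and a plain std::vector standing in for the pinned host_vector (real code needs pinned memory for the copy to be truly asynchronous):

#include <cuda_runtime.h>
#include <array>
#include <cstddef>
#include <vector>

struct staging_ticket {
  cudaEvent_t event;
  std::vector<char> buffer;  // stands in for the pinned host buffer
};

// rotate between two staging buffers without clobbering an in-flight copy
void staged_copies(char* device_dst, char const* src, std::size_t chunk, int n,
                   cudaStream_t stream)
{
  std::array<staging_ticket, 2> tickets{};
  for (auto& t : tickets) { cudaEventCreate(&t.event); }
  std::size_t next = 0;
  for (int i = 0; i < n; i++) {
    staging_ticket& t = tickets[next];
    next              = (next + 1) % tickets.size();
    cudaEventSynchronize(t.event);  // wait until this buffer is free again
    t.buffer.assign(src + i * chunk, src + (i + 1) * chunk);
    cudaMemcpyAsync(device_dst + i * chunk, t.buffer.data(), chunk,
                    cudaMemcpyHostToDevice, stream);
    cudaEventRecord(t.event, stream);  // mark when this buffer may be reused
  }
  for (auto& t : tickets) {
    cudaEventSynchronize(t.event);
    cudaEventDestroy(t.event);
  }
}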
+ for (auto& ticket : _tickets) { + CUDF_CUDA_TRY(cudaEventCreate(&(ticket.event))); + } + } + + ~datasource_chunk_reader() override + { + for (auto& ticket : _tickets) { + CUDF_CUDA_TRY(cudaEventDestroy(ticket.event)); + } + } + + void skip_bytes(std::size_t size) override + { + _offset += std::min(_source->size() - _offset, size); + }; + + std::unique_ptr get_next_chunk(std::size_t read_size, + rmm::cuda_stream_view stream) override + { + CUDF_FUNC_RANGE(); + + read_size = std::min(_source->size() - _offset, read_size); + + // get a device buffer containing read data on the device. + auto chunk = rmm::device_uvector(read_size, stream); + + if (_source->supports_device_read() && _source->is_device_read_preferred(read_size)) { + _source->device_read_async( + _offset, read_size, reinterpret_cast(chunk.data()), stream); + } else { + auto& h_ticket = _tickets[_next_ticket_idx]; + + _next_ticket_idx = (_next_ticket_idx + 1) % num_tickets; + + // synchronize on the last host-to-device copy, so we don't clobber the host buffer. + CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event)); + + // resize the host buffer as necessary to contain the requested number of bytes + if (h_ticket.buffer.size() < read_size) { h_ticket.buffer.resize(read_size); } + + _source->host_read(_offset, read_size, reinterpret_cast(h_ticket.buffer.data())); + + // copy the host-pinned data on to device + CUDF_CUDA_TRY(cudaMemcpyAsync( + chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyHostToDevice, stream.value())); + + // record the host-to-device copy. + CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); + } + + _offset += read_size; + + // return the device buffer so it can be processed. + return std::make_unique(std::move(chunk)); + } + + private: + std::size_t _offset = 0; + std::size_t _next_ticket_idx = 0; + std::array _tickets{}; + datasource* _source; +}; + /** * @brief A reader which produces owning chunks of device memory which contain a copy of the data * from an istream. @@ -37,12 +117,14 @@ namespace { class istream_data_chunk_reader : public data_chunk_reader { struct host_ticket { cudaEvent_t event; - thrust::host_vector> buffer; + thrust::host_vector> buffer; }; + constexpr static int num_tickets = 2; + public: istream_data_chunk_reader(std::unique_ptr datastream) - : _datastream(std::move(datastream)), _tickets(2) + : _datastream(std::move(datastream)) { // create an event to track the completion of the last device-to-host copy. for (auto& ticket : _tickets) { @@ -66,7 +148,7 @@ class istream_data_chunk_reader : public data_chunk_reader { auto& h_ticket = _tickets[_next_ticket_idx]; - _next_ticket_idx = (_next_ticket_idx + 1) % _tickets.size(); + _next_ticket_idx = (_next_ticket_idx + 1) % num_tickets; // synchronize on the last host-to-device copy, so we don't clobber the host buffer. CUDF_CUDA_TRY(cudaEventSynchronize(h_ticket.event)); @@ -84,12 +166,8 @@ class istream_data_chunk_reader : public data_chunk_reader { auto chunk = rmm::device_uvector(read_size, stream); // copy the host-pinned data on to device - CUDF_CUDA_TRY(cudaMemcpyAsync( // - chunk.data(), - h_ticket.buffer.data(), - read_size, - cudaMemcpyHostToDevice, - stream.value())); + CUDF_CUDA_TRY(cudaMemcpyAsync( + chunk.data(), h_ticket.buffer.data(), read_size, cudaMemcpyHostToDevice, stream.value())); // record the host-to-device copy. 
CUDF_CUDA_TRY(cudaEventRecord(h_ticket.event, stream.value())); @@ -100,8 +178,8 @@ class istream_data_chunk_reader : public data_chunk_reader { private: std::size_t _next_ticket_idx = 0; + std::array _tickets{}; std::unique_ptr _datastream; - std::vector _tickets; }; /** @@ -180,6 +258,21 @@ class device_span_data_chunk_reader : public data_chunk_reader { uint64_t _position = 0; }; +/** + * @brief A datasource-based data chunk source which creates a datasource_chunk_reader. + */ +class datasource_chunk_source : public data_chunk_source { + public: + datasource_chunk_source(datasource& source) : _source(&source) {} + [[nodiscard]] std::unique_ptr create_reader() const override + { + return std::make_unique(_source); + } + + private: + datasource* _source; +}; + /** * @brief A file data source which creates an istream_data_chunk_reader. */ @@ -228,6 +321,11 @@ class device_span_data_chunk_source : public data_chunk_source { } // namespace +std::unique_ptr make_source(datasource& data) +{ + return std::make_unique(data); +} + std::unique_ptr make_source(host_span data) { return std::make_unique(data); diff --git a/cpp/src/io/text/multibyte_split.cu b/cpp/src/io/text/multibyte_split.cu index 133c5fe9826..1177be6b63f 100644 --- a/cpp/src/io/text/multibyte_split.cu +++ b/cpp/src/io/text/multibyte_split.cu @@ -14,24 +14,23 @@ * limitations under the License. */ -// Can be removed once we use Thrust 1.16+ -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wpragmas" -#pragma GCC diagnostic ignored "-Wsizeof-array-div" +#include #include #include +#include #include #include #include #include #include #include +#include #include +#include #include #include -#include #include #include #include @@ -39,57 +38,19 @@ #include #include +#include +#include #include #include #include -#pragma GCC diagnostic pop - +#include +#include #include #include #include -namespace cudf { - -/** - * @brief A device span consisting of two separate device_spans acting as if they were part of a - * single span. The first head.size() entries are served from the first span, the remaining - * tail.size() entries are served from the second span. - * - * @tparam T The type of elements in the span. - */ -template -class split_device_span { - public: - explicit constexpr split_device_span(device_span head, device_span tail = {}) - : _head{head}, _tail{tail} - { - } - - [[nodiscard]] constexpr T& operator[](size_type i) - { - return i < _head.size() ? _head[i] : _tail[i - _head.size()]; - } - - [[nodiscard]] constexpr const T& operator[](size_type i) const - { - return i < _head.size() ? _head[i] : _tail[i - _head.size()]; - } - - [[nodiscard]] constexpr size_type size() const { return _head.size() + _tail.size(); } - - [[nodiscard]] constexpr device_span head() const { return _head; } - - [[nodiscard]] constexpr device_span tail() const { return _tail; } - - private: - device_span _head; - device_span _tail; -}; - -} // namespace cudf - namespace { using cudf::io::text::detail::multistate; @@ -160,6 +121,10 @@ struct PatternScan { } }; +// type aliases to distinguish between row offsets and character offsets +using output_offset = int64_t; +using byte_offset = int64_t; + // multibyte_split works by splitting up inputs in to 32 inputs (bytes) per thread, and transforming // them in to data structures called "multistates". 
these multistates are created by searching a // trie, but instead of a tradition trie where the search begins at a single node at the beginning, @@ -170,35 +135,11 @@ struct PatternScan { // it begins in. From there, each thread can then take deterministic action. In this case, the // deterministic action is counting and outputting delimiter offsets when a delimiter is found. -// This struct provides output offsets that are only incremented until a cutoff point. -struct cutoff_offset { - // magnitude stores the offset, sign bit stores whether we are past the cutoff - int64_t value = 0; - - constexpr cutoff_offset() = default; - - constexpr cutoff_offset(int64_t offset, bool is_past_cutoff) - : value{is_past_cutoff ? -offset : offset} - { - } - - [[nodiscard]] constexpr int64_t offset() const { return value < 0 ? -value : value; } - - [[nodiscard]] constexpr bool is_past_end() { return value < 0; } - - friend constexpr cutoff_offset operator+(cutoff_offset lhs, cutoff_offset rhs) - { - auto const past_end = lhs.is_past_end() or rhs.is_past_end(); - auto const offset = lhs.offset() + (lhs.is_past_end() ? 0 : rhs.offset()); - return cutoff_offset{offset, past_end}; - } -}; - __global__ void multibyte_split_init_kernel( cudf::size_type base_tile_idx, cudf::size_type num_tiles, cudf::io::text::detail::scan_tile_state_view tile_multistates, - cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, cudf::io::text::detail::scan_tile_status status = cudf::io::text::detail::scan_tile_status::invalid) { @@ -212,9 +153,9 @@ __global__ void multibyte_split_init_kernel( __global__ void multibyte_split_seed_kernel( cudf::io::text::detail::scan_tile_state_view tile_multistates, - cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, multistate tile_multistate_seed, - cutoff_offset tile_output_offset) + output_offset tile_output_offset) { auto const thread_idx = blockIdx.x * blockDim.x + threadIdx.x; if (thread_idx == 0) { @@ -225,19 +166,18 @@ __global__ void multibyte_split_seed_kernel( __global__ __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( cudf::size_type base_tile_idx, - int64_t base_input_offset, - int64_t base_offset_offset, + byte_offset base_input_offset, + output_offset base_output_offset, cudf::io::text::detail::scan_tile_state_view tile_multistates, - cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, cudf::device_span delim, cudf::device_span chunk_input_chars, - int64_t byte_range_end, - cudf::split_device_span output_offsets) + cudf::split_device_span row_offsets) { using InputLoad = cub::BlockLoad; - using OffsetScan = cub::BlockScan; - using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback; + using OffsetScan = cub::BlockScan; + using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback; __shared__ union { typename InputLoad::TempStorage input_load; @@ -269,17 +209,15 @@ __global__ __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( // STEP 3: Flag matches - cutoff_offset thread_offset; + output_offset thread_offset{}; uint32_t thread_match_mask[(ITEMS_PER_THREAD + 31) / 32]{}; for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { - thread_multistate = transition(thread_chars[i], thread_multistate, delim); - auto const thread_state = thread_multistate.max_tail(); - auto const is_match = i 
< thread_input_size and thread_state == delim.size(); - auto const match_end = base_input_offset + thread_input_offset + i + 1; - auto const is_past_range = match_end >= byte_range_end; + thread_multistate = transition(thread_chars[i], thread_multistate, delim); + auto const thread_state = thread_multistate.max_tail(); + auto const is_match = i < thread_input_size and thread_state == delim.size(); thread_match_mask[i / 32] |= uint32_t{is_match} << (i % 32); - thread_offset = thread_offset + cutoff_offset{is_match, is_past_range}; + thread_offset += output_offset{is_match}; } // STEP 4: Scan flags to determine absolute thread output offset @@ -293,29 +231,27 @@ __global__ __launch_bounds__(THREADS_PER_TILE) void multibyte_split_kernel( for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { auto const is_match = (thread_match_mask[i / 32] >> (i % 32)) & 1u; - if (is_match && !thread_offset.is_past_end()) { - auto const match_end = base_input_offset + thread_input_offset + i + 1; - auto const is_past_range = match_end >= byte_range_end; - output_offsets[thread_offset.offset() - base_offset_offset] = match_end; - thread_offset = thread_offset + cutoff_offset{true, is_past_range}; + if (is_match) { + auto const match_end = base_input_offset + thread_input_offset + i + 1; + row_offsets[thread_offset - base_output_offset] = match_end; + thread_offset++; } } } __global__ __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( cudf::size_type base_tile_idx, - int64_t base_input_offset, - int64_t base_offset_offset, - cudf::io::text::detail::scan_tile_state_view tile_output_offsets, + byte_offset base_input_offset, + output_offset base_output_offset, + cudf::io::text::detail::scan_tile_state_view tile_output_offsets, char delim, cudf::device_span chunk_input_chars, - int64_t byte_range_end, - cudf::split_device_span output_offsets) + cudf::split_device_span row_offsets) { using InputLoad = cub::BlockLoad; - using OffsetScan = cub::BlockScan; - using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback; + using OffsetScan = cub::BlockScan; + using OffsetScanCallback = cudf::io::text::detail::scan_tile_state_callback; __shared__ union { typename InputLoad::TempStorage input_load; @@ -338,15 +274,13 @@ __global__ __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( // STEP 2: Flag matches - cutoff_offset thread_offset; + output_offset thread_offset{}; uint32_t thread_match_mask[(ITEMS_PER_THREAD + 31) / 32]{}; for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { - auto const is_match = i < thread_input_size and thread_chars[i] == delim; - auto const match_end = base_input_offset + thread_input_offset + i + 1; - auto const is_past_range = match_end >= byte_range_end; + auto const is_match = i < thread_input_size and thread_chars[i] == delim; thread_match_mask[i / 32] |= uint32_t{is_match} << (i % 32); - thread_offset = thread_offset + cutoff_offset{is_match, is_past_range}; + thread_offset += output_offset{is_match}; } // STEP 3: Scan flags to determine absolute thread output offset @@ -360,11 +294,10 @@ __global__ __launch_bounds__(THREADS_PER_TILE) void byte_split_kernel( for (int32_t i = 0; i < ITEMS_PER_THREAD; i++) { auto const is_match = (thread_match_mask[i / 32] >> (i % 32)) & 1u; - if (is_match && !thread_offset.is_past_end()) { - auto const match_end = base_input_offset + thread_input_offset + i + 1; - auto const is_past_range = match_end >= byte_range_end; - output_offsets[thread_offset.offset() - base_offset_offset] = match_end; - thread_offset = thread_offset + 
cutoff_offset{true, is_past_range}; + if (is_match) { + auto const match_end = base_input_offset + thread_input_offset + i + 1; + row_offsets[thread_offset - base_output_offset] = match_end; + thread_offset++; } } } @@ -407,173 +340,10 @@ std::vector get_streams(int32_t count, rmm::cuda_stream_p return streams; } -/** - * @brief A chunked storage class that provides preallocated memory for algorithms with known - * worst-case output size. It provides functionality to retrieve the next chunk to write to, for - * reporting how much memory was actually written and for gathering all previously written outputs - * into a single contiguous vector. - * - * @tparam T The output element type. - */ -template -class output_builder { - public: - using size_type = typename rmm::device_uvector::size_type; - - /** - * @brief Initializes an output builder with given worst-case output size and stream. - * - * @param max_write_size the maximum number of elements that will be written into a - * split_device_span returned from `next_output`. - * @param stream the stream used to allocate the first chunk of memory. - * @param mr optional, the memory resource to use for allocation. - */ - output_builder(size_type max_write_size, - size_type max_growth, - rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()) - : _size{0}, _max_write_size{max_write_size}, _max_growth{max_growth} - { - CUDF_EXPECTS(max_write_size > 0, "Internal error"); - _chunks.emplace_back(0, stream, mr); - _chunks.back().reserve(max_write_size * 2, stream); - } - - output_builder(output_builder&&) = delete; - output_builder(const output_builder&) = delete; - output_builder& operator=(output_builder&&) = delete; - output_builder& operator=(const output_builder&) = delete; - - /** - * @brief Returns the next free chunk of `max_write_size` elements from the underlying storage. - * Must be followed by a call to `advance_output` after the memory has been written to. - * - * @param stream The stream to allocate a new chunk of memory with, if necessary. - * This should be the stream that will write to the `split_device_span`. - * @return A `split_device_span` starting directly after the last output and providing at least - * `max_write_size` entries of storage. - */ - [[nodiscard]] split_device_span next_output(rmm::cuda_stream_view stream) - { - auto head_it = _chunks.end() - (_chunks.size() > 1 and _chunks.back().is_empty() ? 2 : 1); - auto head_span = get_free_span(*head_it); - if (head_span.size() >= _max_write_size) { return split_device_span{head_span}; } - if (head_it == _chunks.end() - 1) { - // insert a new vector of double size - auto const next_chunk_size = - std::min(_max_growth * _max_write_size, 2 * _chunks.back().capacity()); - _chunks.emplace_back(0, stream, _chunks.back().memory_resource()); - _chunks.back().reserve(next_chunk_size, stream); - } - auto tail_span = get_free_span(_chunks.back()); - CUDF_EXPECTS(head_span.size() + tail_span.size() >= _max_write_size, "Internal error"); - return split_device_span{head_span, tail_span}; - } - - /** - * @brief Advances the output sizes after a `split_device_span` returned from `next_output` was - * written to. - * - * @param actual_size The number of elements that were written to the result of the previous - * `next_output` call. 
- */ - void advance_output(size_type actual_size) - { - CUDF_EXPECTS(actual_size <= _max_write_size, "Internal error"); - if (_chunks.size() < 2) { - auto const new_size = _chunks.back().size() + actual_size; - inplace_resize(_chunks.back(), new_size); - } else { - auto& tail = _chunks.back(); - auto& prev = _chunks.rbegin()[1]; - auto const prev_advance = std::min(actual_size, prev.capacity() - prev.size()); - auto const tail_advance = actual_size - prev_advance; - inplace_resize(prev, prev.size() + prev_advance); - inplace_resize(tail, tail.size() + tail_advance); - } - _size += actual_size; - } - - /** - * @brief Returns the first element that was written to the output. - * Requires a previous call to `next_output` and `advance_output` and `size() > 0`. - * @param stream The stream used to access the element. - * @return The first element that was written to the output. - */ - [[nodiscard]] T front_element(rmm::cuda_stream_view stream) const - { - return _chunks.front().front_element(stream); - } - - /** - * @brief Returns the last element that was written to the output. - * Requires a previous call to `next_output` and `advance_output` and `size() > 0`. - * @param stream The stream used to access the element. - * @return The last element that was written to the output. - */ - [[nodiscard]] T back_element(rmm::cuda_stream_view stream) const - { - auto const& last_nonempty_chunk = - _chunks.size() > 1 and _chunks.back().is_empty() ? _chunks.rbegin()[1] : _chunks.back(); - return last_nonempty_chunk.back_element(stream); - } - - [[nodiscard]] size_type size() const { return _size; } - - /** - * @brief Gathers all previously written outputs into a single contiguous vector. - * - * @param stream The stream used to allocate and gather the output vector. All previous write - * operations to the output buffer must have finished or happened on this stream. - * @param mr The memory resource used to allocate the output vector. - * @return The output vector. - */ - rmm::device_uvector gather(rmm::cuda_stream_view stream, - rmm::mr::device_memory_resource* mr) const - { - rmm::device_uvector output{size(), stream, mr}; - auto output_it = output.begin(); - for (auto const& chunk : _chunks) { - output_it = thrust::copy( - rmm::exec_policy_nosync(stream), chunk.begin(), chunk.begin() + chunk.size(), output_it); - } - return output; - } - - private: - /** - * @brief Resizes a vector without reallocating - * - * @param vector The vector - * @param new_size The new size. Must be smaller than the vector's capacity - */ - static void inplace_resize(rmm::device_uvector& vector, size_type new_size) - { - CUDF_EXPECTS(new_size <= vector.capacity(), "Internal error"); - vector.resize(new_size, rmm::cuda_stream_view{}); - } - - /** - * @brief Returns the span consisting of all currently unused elements in the vector - * (`i >= size() and i < capacity()`). - * - * @param vector The vector. - * @return The span of unused elements. 
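output_builder is deleted from this file (the class moves out of multibyte_split.cu; the call sites below keep using it, with advance_output gaining a stream parameter in its new home). Against the interface shown above, the intended call pattern is roughly the following hypothetical usage, with names and sizes invented:

// request a worst-case-sized window, write into it, then commit the actual count
auto stream = cudf::get_default_stream();
output_builder<int64_t> offsets(/*max_write_size=*/1024, /*max_growth=*/8, stream);

auto window = offsets.next_output(stream);  // split_device_span: head span + tail span
// ... a kernel writes n <= 1024 values through window[i] ...
cudf::size_type const n = 42;               // pretend the kernel reported 42 entries
offsets.advance_output(n);                  // keep only the written prefix

// repeat next_output/advance_output per chunk, then collect everything at once:
auto all = offsets.gather(stream, rmm::mr::get_current_device_resource());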
- */ - static device_span get_free_span(rmm::device_uvector& vector) - { - return device_span{vector.data() + vector.size(), vector.capacity() - vector.size()}; - } - - size_type _size; - size_type _max_write_size; - size_type _max_growth; - std::vector> _chunks; -}; - std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source const& source, std::string const& delimiter, byte_range_info byte_range, + bool strip_delimiters, rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr, rmm::cuda_stream_pool& stream_pool) @@ -611,7 +381,7 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source // best when at least 32 more than max possible concurrent tiles, due to rolling `invalid`s auto num_tile_states = std::max(32, TILES_PER_CHUNK * concurrency + 32); auto tile_multistates = scan_tile_state(num_tile_states, stream); - auto tile_offsets = scan_tile_state(num_tile_states, stream); + auto tile_offsets = scan_tile_state(num_tile_states, stream); multibyte_split_init_kernel<< multibyte_split(cudf::io::text::data_chunk_source tile_multistates, tile_offsets, multistate_seed, - {}); + 0); auto reader = source.create_reader(); - auto chunk_offset = std::max(0, byte_range.offset() - delimiter.size()); + auto chunk_offset = std::max(0, byte_range.offset() - delimiter.size()); auto const byte_range_end = byte_range.offset() + byte_range.size(); reader->skip_bytes(chunk_offset); // amortize output chunk allocations over 8 worst-case outputs. This limits the overallocation constexpr auto max_growth = 8; - output_builder offset_storage(ITEMS_PER_CHUNK, max_growth, stream); + output_builder row_offset_storage(ITEMS_PER_CHUNK, max_growth, stream); output_builder char_storage(ITEMS_PER_CHUNK, max_growth, stream); fork_stream(streams, stream); @@ -653,22 +423,23 @@ std::unique_ptr multibyte_split(cudf::io::text::data_chunk_source auto& scan_stream = streams[1]; auto chunk = reader->get_next_chunk(ITEMS_PER_CHUNK, read_stream); int64_t base_tile_idx = 0; - std::optional first_offset; - std::optional last_offset; - if (byte_range.offset() == 0) { first_offset = 0; } + std::optional first_row_offset; + std::optional last_row_offset; + bool found_last_offset = false; + if (byte_range.offset() == 0) { first_row_offset = 0; } std::swap(read_stream, scan_stream); while (chunk->size() > 0) { // if we found the last delimiter, or didn't find delimiters inside the byte range at all: abort - if (last_offset.has_value() or - (not first_offset.has_value() and chunk_offset >= byte_range_end)) { + if (last_row_offset.has_value() or + (not first_row_offset.has_value() and chunk_offset >= byte_range_end)) { break; } auto tiles_in_launch = cudf::util::div_rounding_up_safe(chunk->size(), static_cast(ITEMS_PER_TILE)); - auto offset_output = offset_storage.next_output(scan_stream); + auto row_offsets = row_offset_storage.next_output(scan_stream); // reset the next chunk of tile state multibyte_split_init_kernel<< multibyte_split(cudf::io::text::data_chunk_source scan_stream.value()>>>( // base_tile_idx, chunk_offset, - offset_storage.size(), + row_offset_storage.size(), tile_offsets, delimiter[0], *chunk, - byte_range_end, - offset_output); + row_offsets); } else { multibyte_split_kernel<< multibyte_split(cudf::io::text::data_chunk_source scan_stream.value()>>>( // base_tile_idx, chunk_offset, - offset_storage.size(), + row_offset_storage.size(), tile_multistates, tile_offsets, {device_delim.data(), static_cast(device_delim.size())}, *chunk, - byte_range_end, - offset_output); + row_offsets); } // load 
@@ -750,30 +539,54 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
   // if the input was empty, we didn't find a delimiter at all,
   // or the first delimiter was also the last: empty output
-  if (chunk_offset == 0 or not first_offset.has_value() or first_offset == last_offset) {
+  if (chunk_offset == 0 or not first_row_offset.has_value() or
+      first_row_offset == last_row_offset) {
     return make_empty_column(type_id::STRING);
   }

   auto chars          = char_storage.gather(stream, mr);
-  auto global_offsets = offset_storage.gather(stream, mr);
-
-  bool const insert_begin = *first_offset == 0;
-  bool const insert_end   = not last_offset.has_value() or last_offset == chunk_offset;
+  auto global_offsets = row_offset_storage.gather(stream, mr);
+
+  // insert an offset at the beginning if we started at the beginning of the input
+  bool const insert_begin = first_row_offset.value_or(0) == 0;
+  // insert an offset at the end if we have not terminated the last row
+  bool const insert_end =
+    not(last_row_offset.has_value() or
+        (global_offsets.size() > 0 and global_offsets.back_element(stream) == chunk_offset));
   rmm::device_uvector<size_type> offsets{
     global_offsets.size() + insert_begin + insert_end, stream, mr};
   if (insert_begin) { offsets.set_element_to_zero_async(0, stream); }
-  if (insert_end) { offsets.set_element(offsets.size() - 1, chunk_offset - *first_offset, stream); }
+  if (insert_end) {
+    offsets.set_element(offsets.size() - 1, chunk_offset - *first_row_offset, stream);
+  }
   thrust::transform(rmm::exec_policy(stream),
                     global_offsets.begin(),
                     global_offsets.end(),
                     offsets.begin() + insert_begin,
-                    [baseline = *first_offset] __device__(int64_t global_offset) {
+                    [baseline = *first_row_offset] __device__(byte_offset global_offset) {
                       return static_cast<size_type>(global_offset - baseline);
                     });

-  auto string_count = offsets.size() - 1;
-
-  return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars));
+  auto string_count = offsets.size() - 1;
+  if (strip_delimiters) {
+    auto it = cudf::detail::make_counting_transform_iterator(
+      0,
+      [ofs        = offsets.data(),
+       chars      = chars.data(),
+       delim_size = static_cast<size_type>(delimiter.size()),
+       last_row   = static_cast<size_type>(string_count) - 1,
+       insert_end] __device__(size_type row) {
+        auto const begin = ofs[row];
+        auto const len   = ofs[row + 1] - begin;
+        if (row == last_row && insert_end) {
+          return thrust::make_pair(chars + begin, len);
+        } else {
+          return thrust::make_pair(chars + begin, std::max(0, len - delim_size));
+        };
+      });
+    return cudf::strings::detail::make_strings_column(it, it + string_count, stream, mr);
+  } else {
+    return cudf::make_strings_column(string_count, std::move(offsets), std::move(chars));
+  }
 }

 }  // namespace detail
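// A host-side sketch of the per-row length rule that the strip_delimiters path above
// applies: the trailing delimiter is removed from each row, except for a final
// unterminated row (the insert_end case). Names are hypothetical stand-ins; the real code
// builds thrust pairs on the device via the counting transform iterator.
#include <algorithm>
#include <cstdint>
#include <utility>
#include <vector>

inline std::pair<std::int64_t, std::int64_t> stripped_row(
  std::vector<std::int64_t> const& offsets, std::size_t row, std::size_t last_row,
  bool unterminated_last, std::int64_t delim_size)
{
  auto const begin = offsets[row];
  auto const len   = offsets[row + 1] - begin;
  if (row == last_row && unterminated_last) { return {begin, len}; }
  return {begin, std::max<std::int64_t>(0, len - delim_size)};
}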
@@ -783,11 +596,20 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
                                               std::optional<byte_range_info> byte_range,
                                               rmm::mr::device_memory_resource* mr)
 {
-  auto stream      = cudf::default_stream_value;
+  return multibyte_split(
+    source, delimiter, parse_options{byte_range.value_or(create_byte_range_info_max())}, mr);
+}
+
+std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source const& source,
+                                              std::string const& delimiter,
+                                              parse_options options,
+                                              rmm::mr::device_memory_resource* mr)
+{
+  auto stream      = cudf::get_default_stream();
   auto stream_pool = rmm::cuda_stream_pool(2);

   auto result = detail::multibyte_split(
-    source, delimiter, byte_range.value_or(create_byte_range_info_max()), stream, mr, stream_pool);
+    source, delimiter, options.byte_range, options.strip_delimiters, stream, mr, stream_pool);

   return result;
 }
@@ -796,7 +618,7 @@ std::unique_ptr<cudf::column> multibyte_split(cudf::io::text::data_chunk_source
                                               std::string const& delimiter,
                                               rmm::mr::device_memory_resource* mr)
 {
-  return multibyte_split(source, delimiter, std::nullopt, mr);
+  return multibyte_split(source, delimiter, parse_options{}, mr);
 }

 }  // namespace text
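// A usage sketch for the new parse_options overload added above. The file name is made up;
// make_source_from_file is the existing factory in cudf::io::text, and strip_delimiters /
// byte_range are the members this patch bundles into parse_options.
#include <cudf/column/column.hpp>
#include <cudf/io/text/data_chunk_source_factories.hpp>
#include <cudf/io/text/multibyte_split.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

std::unique_ptr<cudf::column> read_lines_without_newlines()
{
  auto source = cudf::io::text::make_source_from_file("input.txt");
  cudf::io::text::parse_options options;
  options.strip_delimiters = true;  // rows come back without the trailing "\n"
  return cudf::io::text::multibyte_split(*source, "\n", options,
                                         rmm::mr::get_current_device_resource());
}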
diff --git a/cpp/src/io/utilities/column_buffer.cpp b/cpp/src/io/utilities/column_buffer.cpp
index e2d209a7c0a..89ba5c598e8 100644
--- a/cpp/src/io/utilities/column_buffer.cpp
+++ b/cpp/src/io/utilities/column_buffer.cpp
@@ -22,6 +22,7 @@
 #include "column_buffer.hpp"
 #include <…>
 #include <…>
+#include <…>

 namespace cudf {
 namespace io {
@@ -54,6 +55,33 @@ void column_buffer::create(size_type _size,
   }
 }

+namespace {
+
+/**
+ * @brief Recursively copy `name` and `user_data` fields of one buffer to another.
+ *
+ * @param buff The old output buffer
+ * @param new_buff The new output buffer
+ */
+void copy_buffer_data(column_buffer const& buff, column_buffer& new_buff)
+{
+  new_buff.name      = buff.name;
+  new_buff.user_data = buff.user_data;
+  for (auto const& child : buff.children) {
+    auto& new_child = new_buff.children.emplace_back(column_buffer(child.type, child.is_nullable));
+    copy_buffer_data(child, new_child);
+  }
+}
+
+}  // namespace
+
+column_buffer column_buffer::empty_like(column_buffer const& input)
+{
+  auto new_buff = column_buffer(input.type, input.is_nullable);
+  copy_buffer_data(input, new_buff);
+  return new_buff;
+}
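// A hypothetical usage sketch: empty_like yields a structurally identical buffer (type,
// nullability, name, user_data, and the full child hierarchy) without any device
// allocations, e.g. as a placeholder for a column a reader decides not to materialize.
// column_buffer.hpp is the in-tree header patched below, not a public cudf include.
#include "io/utilities/column_buffer.hpp"

cudf::io::detail::column_buffer make_placeholder(cudf::io::detail::column_buffer const& existing)
{
  return cudf::io::detail::column_buffer::empty_like(existing);
}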
+
 /**
  * @copydoc cudf::io::detail::make_column
  */
@@ -78,7 +106,19 @@ std::unique_ptr<column> make_column(column_buffer& buffer,
       // convert to binary
       auto const string_col = make_strings_column(*buffer._strings, stream, mr);
       auto const num_rows   = string_col->size();
-      auto col_contest      = string_col->release();
+      auto col_content      = string_col->release();
+
+      // convert to uint8 column, strings are currently stored as int8
+      auto contents =
+        col_content.children[strings_column_view::chars_column_index].release()->release();
+      auto data      = contents.data.release();
+      auto null_mask = contents.null_mask.release();
+
+      auto uint8_col = std::make_unique<column>(data_type{type_id::UINT8},
+                                                data->size(),
+                                                std::move(*data),
+                                                std::move(*null_mask),
+                                                UNKNOWN_NULL_COUNT);

       if (schema_info != nullptr) {
         schema_info->children.push_back(column_name_info{"offsets"});
@@ -87,10 +127,10 @@ std::unique_ptr<column> make_column(column_buffer& buffer,

       return make_lists_column(
         num_rows,
-        std::move(col_contest.children[strings_column_view::offsets_column_index]),
-        std::move(col_contest.children[strings_column_view::chars_column_index]),
+        std::move(col_content.children[strings_column_view::offsets_column_index]),
+        std::move(uint8_col),
         UNKNOWN_NULL_COUNT,
-        std::move(*col_contest.null_mask));
+        std::move(*col_content.null_mask));
     }

     case type_id::LIST: {
diff --git a/cpp/src/io/utilities/column_buffer.hpp b/cpp/src/io/utilities/column_buffer.hpp
index 8ae3d39a3ba..8f181157fae 100644
--- a/cpp/src/io/utilities/column_buffer.hpp
+++ b/cpp/src/io/utilities/column_buffer.hpp
@@ -104,10 +104,14 @@ struct column_buffer {
   {
     return static_cast<bitmask_type*>(_null_mask.data());
   }
-  auto null_mask_size() { return _null_mask.size(); };
+  auto null_mask_size() { return _null_mask.size(); }

   auto& null_count() { return _null_count; }

+  // Create a new column_buffer that has empty data but with the same basic information as the
+  // input column, including same type, nullability, name, and user_data.
+  static column_buffer empty_like(column_buffer const& input);
+
   std::unique_ptr<rmm::device_uvector<string_index_pair>> _strings;
   rmm::device_buffer _data{};
   rmm::device_buffer _null_mask{};
diff --git a/cpp/src/io/utilities/column_type_histogram.hpp b/cpp/src/io/utilities/column_type_histogram.hpp
index 8bd2d3a89cf..88f4e58f9b1 100644
--- a/cpp/src/io/utilities/column_type_histogram.hpp
+++ b/cpp/src/io/utilities/column_type_histogram.hpp
@@ -33,6 +33,11 @@ struct column_type_histogram {
   cudf::size_type positive_small_int_count{};
   cudf::size_type big_int_count{};
   cudf::size_type bool_count{};
+  auto total_count() const
+  {
+    return null_count + float_count + datetime_count + string_count + negative_small_int_count +
+           positive_small_int_count + big_int_count + bool_count;
+  }
 };

 }  // namespace io
diff --git a/cpp/src/io/utilities/column_utils.cuh b/cpp/src/io/utilities/column_utils.cuh
index fbeaaa9c0fc..598c93a1a4f 100644
--- a/cpp/src/io/utilities/column_utils.cuh
+++ b/cpp/src/io/utilities/column_utils.cuh
@@ -64,7 +64,7 @@ rmm::device_uvector create_leaf_column_device_views(
     iter,
     iter + parent_table_device_view.num_columns(),
     [col_desc, parent_col_view = parent_table_device_view, leaf_columns] __device__(
-      size_type index) mutable {
+      size_type index) {
       col_desc[index].parent_column = parent_col_view.begin() + index;
       column_device_view col        = parent_col_view.column(index);
       // traverse till leaf column
@@ -74,7 +74,7 @@ rmm::device_uvector create_leaf_column_device_views(
                      : col.child(0);
       // stop early if writing a byte array
       if (col_desc[index].stats_dtype == dtype_byte_array &&
-          (child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8)) {
+          child.type().id() == type_id::UINT8) {
         break;
       }
       col = child;
diff --git a/cpp/src/io/utilities/file_io_utilities.cpp b/cpp/src/io/utilities/file_io_utilities.cpp
index c0dd85702e2..2484a36143a 100644
--- a/cpp/src/io/utilities/file_io_utilities.cpp
+++ b/cpp/src/io/utilities/file_io_utilities.cpp
@@ -257,11 +257,20 @@ std::future cufile_output_impl::write_async(void const* data, size_t offse
   // writes.
   return std::async(std::launch::deferred, waiter, std::move(slice_tasks));
 }
+#else
+cufile_input_impl::cufile_input_impl(std::string const& filepath)
+{
+  CUDF_FAIL("Cannot create cuFile source, current build was compiled without cuFile headers");
+}
+
+cufile_output_impl::cufile_output_impl(std::string const& filepath)
+{
+  CUDF_FAIL("Cannot create cuFile sink, current build was compiled without cuFile headers");
+}
 #endif

 std::unique_ptr<cufile_input_impl> make_cufile_input(std::string const& filepath)
 {
-#ifdef CUFILE_FOUND
   if (cufile_integration::is_gds_enabled()) {
     try {
       return std::make_unique<cufile_input_impl>(filepath);
@@ -269,13 +278,11 @@ std::unique_ptr make_cufile_input(std::string const& filepath
       if (cufile_integration::is_always_enabled()) throw;
     }
   }
-#endif
   return nullptr;
 }

 std::unique_ptr<cufile_output_impl> make_cufile_output(std::string const& filepath)
 {
-#ifdef CUFILE_FOUND
   if (cufile_integration::is_gds_enabled()) {
     try {
       return std::make_unique<cufile_output_impl>(filepath);
@@ -283,7 +290,6 @@ std::unique_ptr make_cufile_output(std::string const& filepa
       if (cufile_integration::is_always_enabled()) throw;
     }
   }
-#endif
   return nullptr;
 }
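// The fallback pattern used above, reduced to a generic sketch: without the optional
// dependency the same constructor still exists but fails loudly, and the factory degrades
// to nullptr unless the feature was forced on. All names here are illustrative only.
#include <memory>
#include <stdexcept>
#include <string>

struct gds_reader {
  explicit gds_reader(std::string const&)
  {
#ifndef CUFILE_FOUND
    throw std::logic_error("compiled without cuFile support");
#endif
  }
};

inline std::unique_ptr<gds_reader> try_make_gds_reader(std::string const& path, bool force_gds)
{
  try {
    return std::make_unique<gds_reader>(path);
  } catch (...) {
    if (force_gds) throw;  // mirrors cufile_integration::is_always_enabled()
  }
  return nullptr;  // silent fallback to the plain file I/O path
}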
diff --git a/cpp/src/io/utilities/file_io_utilities.hpp b/cpp/src/io/utilities/file_io_utilities.hpp
index 704ee77de8a..38674892966 100644
--- a/cpp/src/io/utilities/file_io_utilities.hpp
+++ b/cpp/src/io/utilities/file_io_utilities.hpp
@@ -194,6 +194,7 @@ class cufile_output_impl final : public cufile_output {
 class cufile_input_impl final : public cufile_input {
  public:
+  cufile_input_impl(std::string const& filepath);
   std::future<size_t> read_async(size_t offset,
                                  size_t size,
                                  uint8_t* dst,
@@ -205,6 +206,7 @@ class cufile_input_impl final : public cufile_input {
 class cufile_output_impl final : public cufile_output {
  public:
+  cufile_output_impl(std::string const& filepath);
   std::future<void> write_async(void const* data, size_t offset, size_t size) override
   {
     CUDF_FAIL("Only used to compile without cufile library, should not be called");
diff --git a/cpp/src/io/utilities/hostdevice_vector.hpp b/cpp/src/io/utilities/hostdevice_vector.hpp
index b5e59871119..77dade24009 100644
--- a/cpp/src/io/utilities/hostdevice_vector.hpp
+++ b/cpp/src/io/utilities/hostdevice_vector.hpp
@@ -16,6 +16,7 @@

 #pragma once

+#include <cudf/detail/utilities/pinned_allocator.hpp>
 #include <…>
 #include <…>
 #include <…>
@@ -24,7 +25,6 @@
 #include <…>
 #include <…>

-#include <thrust/system/cuda/experimental/pinned_allocator.h>

 /**
  * @brief A helper class that wraps fixed-length device memory for the GPU, and
@@ -40,7 +40,7 @@ class hostdevice_vector {
  public:
   using value_type = T;

-  hostdevice_vector() : hostdevice_vector(0, cudf::default_stream_value) {}
+  hostdevice_vector() : hostdevice_vector(0, cudf::get_default_stream()) {}

   explicit hostdevice_vector(size_t size, rmm::cuda_stream_view stream)
     : hostdevice_vector(size, size, stream)
@@ -126,7 +126,7 @@ class hostdevice_vector {
   }

  private:
-  thrust::host_vector<T, thrust::system::cuda::experimental::pinned_allocator<T>> h_data;
+  thrust::host_vector<T, cudf::detail::pinned_allocator<T>> h_data;
   rmm::device_uvector<T> d_data;
 };
diff --git a/cpp/src/io/utilities/output_builder.cuh b/cpp/src/io/utilities/output_builder.cuh
new file mode 100644
index 00000000000..e45143480fc
--- /dev/null
+++ b/cpp/src/io/utilities/output_builder.cuh
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <cudf/types.hpp>
+#include <cudf/utilities/error.hpp>
+#include <cudf/utilities/span.hpp>
+
+#include <rmm/cuda_stream_view.hpp>
+#include <rmm/device_uvector.hpp>
+#include <rmm/mr/device/device_memory_resource.hpp>
+
+#include <thrust/copy.h>
+
+#include <algorithm>
+
+namespace cudf {
+
+template <typename T>
+class split_device_span_iterator;
+
+/**
+ * @brief A device span consisting of two separate device_spans acting as if they were part of a
+ * single span. The first head.size() entries are served from the first span, the remaining
+ * tail.size() entries are served from the second span.
+ *
+ * @tparam T The type of elements in the span.
+ */
+template <typename T>
+class split_device_span {
+ public:
+  using element_type    = T;
+  using value_type      = std::remove_cv_t<T>;
+  using size_type       = std::size_t;
+  using difference_type = std::ptrdiff_t;
+  using pointer         = T*;
+  using iterator        = split_device_span_iterator<T>;
+  using const_pointer   = T const*;
+  using reference       = T&;
+  using const_reference = T const&;
+
+  split_device_span() = default;
+
+  explicit constexpr split_device_span(device_span<T> head, device_span<T> tail = {})
+    : _head{head}, _tail{tail}
+  {
+  }
+
+  [[nodiscard]] constexpr reference operator[](size_type i) const
+  {
+    return i < _head.size() ? _head[i] : _tail[i - _head.size()];
+  }
+
+  [[nodiscard]] constexpr size_type size() const { return _head.size() + _tail.size(); }
+
+  [[nodiscard]] constexpr device_span<T> head() const { return _head; }
+
+  [[nodiscard]] constexpr device_span<T> tail() const { return _tail; }
+
+  [[nodiscard]] constexpr iterator begin() const;
+
+  [[nodiscard]] constexpr iterator end() const;
+
+ private:
+  device_span<T> _head;
+  device_span<T> _tail;
+};
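// A host-side analog of split_device_span's indexing rule, with raw pointers standing in
// for the two device spans: index i is served by `head` while i < head_size and by `tail`
// otherwise, so two disjoint buffers read like one contiguous range.
#include <cstddef>

template <typename T>
struct split_span_sketch {
  T* head;
  std::size_t head_size;
  T* tail;
  std::size_t tail_size;

  T& operator[](std::size_t i) const { return i < head_size ? head[i] : tail[i - head_size]; }
  std::size_t size() const { return head_size + tail_size; }
};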
+/**
+ * @brief A random access iterator indexing into a split_device_span.
+ *
+ * @tparam T The type of elements in the underlying span.
+ */
+template <typename T>
+class split_device_span_iterator {
+  using it = split_device_span_iterator;
+
+ public:
+  using size_type         = std::size_t;
+  using difference_type   = std::ptrdiff_t;
+  using value_type        = T;
+  using pointer           = value_type*;
+  using reference         = value_type&;
+  using iterator_category = std::random_access_iterator_tag;
+
+  split_device_span_iterator() = default;
+
+  constexpr split_device_span_iterator(split_device_span<T> span, size_type offset)
+    : _span{span}, _offset{offset}
+  {
+  }
+
+  [[nodiscard]] constexpr reference operator*() const { return _span[_offset]; }
+
+  [[nodiscard]] constexpr reference operator[](size_type i) const { return _span[_offset + i]; }
+
+  [[nodiscard]] constexpr friend bool operator==(const it& lhs, const it& rhs)
+  {
+    return lhs._offset == rhs._offset;
+  }
+
+  [[nodiscard]] constexpr friend bool operator!=(const it& lhs, const it& rhs)
+  {
+    return !(lhs == rhs);
+  }
+
+  [[nodiscard]] constexpr friend bool operator<(const it& lhs, const it& rhs)
+  {
+    return lhs._offset < rhs._offset;
+  }
+
+  [[nodiscard]] constexpr friend bool operator>=(const it& lhs, const it& rhs)
+  {
+    return !(lhs < rhs);
+  }
+
+  [[nodiscard]] constexpr friend bool operator>(const it& lhs, const it& rhs) { return rhs < lhs; }
+
+  [[nodiscard]] constexpr friend bool operator<=(const it& lhs, const it& rhs)
+  {
+    return !(lhs > rhs);
+  }
+
+  [[nodiscard]] constexpr friend difference_type operator-(const it& lhs, const it& rhs)
+  {
+    return lhs._offset - rhs._offset;
+  }
+
+  [[nodiscard]] constexpr friend it operator+(it lhs, difference_type i) { return lhs += i; }
+
+  constexpr it& operator+=(difference_type i)
+  {
+    _offset += i;
+    return *this;
+  }
+
+  constexpr it& operator-=(difference_type i) { return *this += -i; }
+
+  constexpr it& operator++() { return *this += 1; }
+
+  constexpr it& operator--() { return *this -= 1; }
+
+  constexpr it operator++(int)
+  {
+    auto result = *this;
+    ++*this;
+    return result;
+  }
+
+  constexpr it operator--(int)
+  {
+    auto result = *this;
+    --*this;
+    return result;
+  }
+
+ private:
+  split_device_span<T> _span;
+  size_type _offset;
+};
+
+template <typename T>
+[[nodiscard]] constexpr split_device_span_iterator<T> split_device_span<T>::begin() const
+{
+  return {*this, 0};
+}
+
+template <typename T>
+[[nodiscard]] constexpr split_device_span_iterator<T> split_device_span<T>::end() const
+{
+  return {*this, size()};
+}
+
+/**
+ * @brief A chunked storage class that provides preallocated memory for algorithms with known
+ * worst-case output size. It provides functionality to retrieve the next chunk to write to, for
+ * reporting how much memory was actually written and for gathering all previously written outputs
+ * into a single contiguous vector.
+ *
+ * @tparam T The output element type.
+ */
+template <typename T>
+class output_builder {
+ public:
+  using size_type = typename rmm::device_uvector<T>::size_type;
+
+  /**
+   * @brief Initializes an output builder with given worst-case output size and stream.
+   *
+   * @param max_write_size the maximum number of elements that will be written into a
+   *                       split_device_span returned from `next_output`.
+   * @param max_growth the maximum size of a newly allocated chunk, as a multiple of
+   *                   max_write_size.
+   * @param stream the stream used to allocate the first chunk of memory.
+   * @param mr optional, the memory resource to use for allocation.
+   */
+  output_builder(size_type max_write_size,
+                 size_type max_growth,
+                 rmm::cuda_stream_view stream,
+                 rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
+    : _size{0}, _max_write_size{max_write_size}, _max_growth{max_growth}
+  {
+    CUDF_EXPECTS(max_write_size > 0, "Internal error");
+    _chunks.emplace_back(0, stream, mr);
+    _chunks.back().reserve(max_write_size * 2, stream);
+  }
+
+  output_builder(output_builder&&)      = delete;
+  output_builder(const output_builder&) = delete;
+  output_builder& operator=(output_builder&&) = delete;
+  output_builder& operator=(const output_builder&) = delete;
+
+  /**
+   * @brief Returns the next free chunk of `max_write_size` elements from the underlying storage.
+   * Must be followed by a call to `advance_output` after the memory has been written to.
+   *
+   * @param stream The stream to allocate a new chunk of memory with, if necessary.
+   *               This should be the stream that will write to the `split_device_span`.
+   * @return A `split_device_span` starting directly after the last output and providing at least
+   *         `max_write_size` entries of storage.
+   */
+  [[nodiscard]] split_device_span<T> next_output(rmm::cuda_stream_view stream)
+  {
+    auto head_it   = _chunks.end() - (_chunks.size() > 1 and _chunks.back().is_empty() ? 2 : 1);
+    auto head_span = get_free_span(*head_it);
+    if (head_span.size() >= _max_write_size) { return split_device_span<T>{head_span}; }
+    if (head_it == _chunks.end() - 1) {
+      // insert a new device_uvector of double size
+      auto const next_chunk_size =
+        std::min(_max_growth * _max_write_size, 2 * _chunks.back().capacity());
+      _chunks.emplace_back(0, stream, _chunks.back().memory_resource());
+      _chunks.back().reserve(next_chunk_size, stream);
+    }
+    auto tail_span = get_free_span(_chunks.back());
+    CUDF_EXPECTS(head_span.size() + tail_span.size() >= _max_write_size, "Internal error");
+    return split_device_span<T>{head_span, tail_span};
+  }
+
+  /**
+   * @brief Advances the output sizes after a `split_device_span` returned from `next_output` was
+   * written to.
+   *
+   * @param actual_size The number of elements that were written to the result of the previous
+   *                    `next_output` call.
+   * @param stream The stream on which to resize the vectors. Since this function will not
+   *               reallocate, this only changes the stream of the internally stored vectors,
+   *               impacting their subsequent copy and destruction behavior.
+   */
+  void advance_output(size_type actual_size, rmm::cuda_stream_view stream)
+  {
+    CUDF_EXPECTS(actual_size <= _max_write_size, "Internal error");
+    if (_chunks.size() < 2) {
+      auto const new_size = _chunks.back().size() + actual_size;
+      inplace_resize(_chunks.back(), new_size, stream);
+    } else {
+      auto& tail              = _chunks.back();
+      auto& prev              = _chunks.rbegin()[1];
+      auto const prev_advance = std::min(actual_size, prev.capacity() - prev.size());
+      auto const tail_advance = actual_size - prev_advance;
+      inplace_resize(prev, prev.size() + prev_advance, stream);
+      inplace_resize(tail, tail.size() + tail_advance, stream);
+    }
+    _size += actual_size;
+  }
+
+  /**
+   * @brief Returns the first element that was written to the output.
+   * Requires a previous call to `next_output` and `advance_output` and `size() > 0`.
+   * @param stream The stream used to access the element.
+   * @return The first element that was written to the output.
+   */
+  [[nodiscard]] T front_element(rmm::cuda_stream_view stream) const
+  {
+    return _chunks.front().front_element(stream);
+  }
+
+  /**
+   * @brief Returns the last element that was written to the output.
+   * Requires a previous call to `next_output` and `advance_output` and `size() > 0`.
+   * @param stream The stream used to access the element.
+   * @return The last element that was written to the output.
+   */
+  [[nodiscard]] T back_element(rmm::cuda_stream_view stream) const
+  {
+    auto const& last_nonempty_chunk =
+      _chunks.size() > 1 and _chunks.back().is_empty() ? _chunks.rbegin()[1] : _chunks.back();
+    return last_nonempty_chunk.back_element(stream);
+  }
+
+  [[nodiscard]] size_type size() const { return _size; }
+
+  /**
+   * @brief Gathers all previously written outputs into a single contiguous vector.
+   *
+   * @param stream The stream used to allocate and gather the output vector. All previous write
+   *               operations to the output buffer must have finished or happened on this stream.
+   * @param mr The memory resource used to allocate the output vector.
+   * @return The output vector.
+   */
+  rmm::device_uvector<T> gather(rmm::cuda_stream_view stream,
+                                rmm::mr::device_memory_resource* mr) const
+  {
+    rmm::device_uvector<T> output{size(), stream, mr};
+    auto output_it = output.begin();
+    for (auto const& chunk : _chunks) {
+      output_it = thrust::copy(
+        rmm::exec_policy_nosync(stream), chunk.begin(), chunk.begin() + chunk.size(), output_it);
+    }
+    return output;
+  }
+
+ private:
+  /**
+   * @brief Resizes a vector without reallocating
+   *
+   * @param vector The vector
+   * @param new_size The new size. Must be smaller than the vector's capacity
+   * @param stream The stream on which to resize the vector. Since this function will not
+   *               reallocate, this only changes the stream of `vector`, impacting its subsequent
+   *               copy and destruction behavior.
+   */
+  static void inplace_resize(rmm::device_uvector<T>& vector,
+                             size_type new_size,
+                             rmm::cuda_stream_view stream)
+  {
+    CUDF_EXPECTS(new_size <= vector.capacity(), "Internal error");
+    vector.resize(new_size, stream);
+  }
+
+  /**
+   * @brief Returns the span consisting of all currently unused elements in the vector
+   * (`i >= size() and i < capacity()`).
+   *
+   * @param vector The vector.
+   * @return The span of unused elements.
+   */
+  static device_span<T> get_free_span(rmm::device_uvector<T>& vector)
+  {
+    return device_span<T>{vector.data() + vector.size(), vector.capacity() - vector.size()};
+  }
+
+  size_type _size;
+  size_type _max_write_size;
+  size_type _max_growth;
+  std::vector<rmm::device_uvector<T>> _chunks;
+};
+
+}  // namespace cudf
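// A usage sketch of the builder contract defined above (output_builder.cuh is the in-tree
// header added by this patch, not a public include; `written` stands in for a count the
// caller would obtain from its own kernel): next_output() hands out at least max_write
// free slots, possibly split across two chunks, advance_output() commits what was actually
// written, and gather() concatenates all chunks into one vector at the end.
#include "io/utilities/output_builder.cuh"
#include <rmm/cuda_stream_view.hpp>
#include <rmm/mr/device/per_device_resource.hpp>

rmm::device_uvector<int> collect_results(rmm::cuda_stream_view stream)
{
  constexpr std::size_t max_write = 1024;
  cudf::output_builder<int> builder(max_write, /*max_growth=*/8, stream);
  for (int pass = 0; pass < 4; ++pass) {
    auto out = builder.next_output(stream);
    // ... launch a kernel that writes up to max_write elements through out[i] ...
    std::size_t written = 0;  // hypothetical count reported back by that kernel
    builder.advance_output(written, stream);
  }
  return builder.gather(stream, rmm::mr::get_current_device_resource());
}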
diff --git a/cpp/src/io/utilities/parsing_utils.cuh b/cpp/src/io/utilities/parsing_utils.cuh
index 388c9b28001..89806956ae5 100644
--- a/cpp/src/io/utilities/parsing_utils.cuh
+++ b/cpp/src/io/utilities/parsing_utils.cuh
@@ -117,8 +117,9 @@ struct parse_options {
 };

 /**
- * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization
- * for integral types. Handles hexadecimal digits, both uppercase and lowercase.
+ * @brief Returns the numeric value of an ASCII/UTF-8 character.
+ * Handles hexadecimal digits, both uppercase and lowercase,
+ * for integral types, and only decimal digits for floating-point types.
  * If the character is not a valid numeric digit then `0` is returned and
  * valid_flag is set to false.
  *
@@ -127,31 +128,14 @@ struct parse_options {
  *
  * @return uint8_t Numeric value of the character, or `0`
  */
-template <typename T, CUDF_ENABLE_IF(std::is_integral_v<T>)>
-constexpr uint8_t decode_digit(char c, bool* valid_flag)
-{
-  if (c >= '0' && c <= '9') return c - '0';
-  if (c >= 'a' && c <= 'f') return c - 'a' + 10;
-  if (c >= 'A' && c <= 'F') return c - 'A' + 10;
-
-  *valid_flag = false;
-  return 0;
-}
-
-/**
- * @brief Returns the numeric value of an ASCII/UTF-8 character. Specialization
- * for non-integral types. Handles only decimal digits. If the character is not
- * a valid numeric digit then `0` is returned and valid_flag is set to false.
- *
- * @param c ASCII or UTF-8 character
- * @param valid_flag Set to false if input is not valid. Unchanged otherwise.
- *
- * @return uint8_t Numeric value of the character, or `0`
- */
-template <typename T, CUDF_ENABLE_IF(!std::is_integral_v<T>)>
+template <typename T, bool as_hex = false>
 constexpr uint8_t decode_digit(char c, bool* valid_flag)
 {
   if (c >= '0' && c <= '9') return c - '0';
+  if constexpr (as_hex and std::is_integral_v<T>) {
+    if (c >= 'a' && c <= 'f') return c - 'a' + 10;
+    if (c >= 'A' && c <= 'F') return c - 'A' + 10;
+  }

   *valid_flag = false;
   return 0;
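// Spot checks for the merged decoder above, assuming the reconstructed signature
// decode_digit<T, as_hex>(char, bool*): hex letters only count for integral T with as_hex
// set; anything else yields 0 and clears the valid flag.
inline bool decode_digit_spot_checks()
{
  bool valid = true;
  bool ok = cudf::io::decode_digit<int, true>('f', &valid) == 15 && valid;    // hex accepted
  valid   = true;
  ok      = ok && cudf::io::decode_digit<float>('f', &valid) == 0 && !valid;  // not for floats
  valid   = true;
  ok      = ok && cudf::io::decode_digit<float>('7', &valid) == 7 && valid;   // decimal always
  return ok;
}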
@@ -194,13 +178,13 @@ constexpr bool is_infinity(char const* begin, char const* end)
  * @return The parsed and converted value
  */
 template <typename T, int base = 10>
-constexpr T parse_numeric(char const* begin,
-                          char const* end,
-                          parse_options_view const& opts,
-                          T error_result = std::numeric_limits<T>::quiet_NaN())
+__host__ __device__ std::optional<T> parse_numeric(char const* begin,
+                                                   char const* end,
+                                                   parse_options_view const& opts)
 {
   T value{};
   bool all_digits_valid = true;
+  constexpr bool as_hex = (base == 16);

   // Handle negative values if necessary
   int32_t sign = (*begin == '-') ? -1 : 1;
@@ -223,7 +207,7 @@
     } else if (base == 10 && (*begin == 'e' || *begin == 'E')) {
       break;
     } else if (*begin != opts.thousands && *begin != '+') {
-      value = (value * base) + decode_digit<T>(*begin, &all_digits_valid);
+      value = (value * base) + decode_digit<T, as_hex>(*begin, &all_digits_valid);
     }
     ++begin;
   }
@@ -237,7 +221,7 @@
         break;
       } else if (*begin != opts.thousands && *begin != '+') {
         divisor /= base;
-        value += decode_digit<T>(*begin, &all_digits_valid) * divisor;
+        value += decode_digit<T, as_hex>(*begin, &all_digits_valid) * divisor;
       }
       ++begin;
     }
@@ -248,12 +232,12 @@
       if (*begin == '-' || *begin == '+') { ++begin; }
       int32_t exponent = 0;
       while (begin < end) {
-        exponent = (exponent * 10) + decode_digit<T>(*(begin++), &all_digits_valid);
+        exponent = (exponent * 10) + decode_digit<T, false>(*(begin++), &all_digits_valid);
       }
       if (exponent != 0) { value *= exp10(double(exponent * exponent_sign)); }
     }
   }
-  if (!all_digits_valid) { return error_result; }
+  if (!all_digits_valid) { return std::optional<T>{}; }

   return value * sign;
 }
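// With parse_numeric now returning an empty optional instead of a caller-supplied error
// value, the failure path becomes explicit at the call site, as ConvertFunctor does
// further down. A minimal device-side sketch (opts is a parse_options_view configured
// elsewhere):
#include <cmath>
#include <limits>

__device__ bool parse_field(char const* begin, char const* end,
                            cudf::io::parse_options_view const& opts, double* out)
{
  auto const parsed = cudf::io::parse_numeric<double>(begin, end, opts);
  *out = parsed.value_or(std::numeric_limits<double>::quiet_NaN());
  return parsed.has_value() && !std::isnan(*out);
}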
@@ -485,7 +469,7 @@ cudf::size_type count_all_from_set(host_span data,
 /**
  * @brief Checks whether the given character is a whitespace character.
  *
- * @param[in] ch The character to check
+ * @param ch The character to check
  *
  * @return True if the input is whitespace, False otherwise
  */
@@ -503,9 +487,9 @@ __inline__ __device__ It skip_character(It const& it, char ch)
 /**
  * @brief Adjusts the range to ignore starting/trailing whitespace and quotation characters.
  *
- * @param[in] begin Pointer to the first character in the parsing range
- * @param[in] end pointer to the first character after the parsing range
- * @param[in] quotechar The character used to denote quotes; '\0' if none
+ * @param begin Pointer to the first character in the parsing range
+ * @param end Pointer to the first character after the parsing range
+ * @param quotechar The character used to denote quotes; '\0' if none
  *
  * @return Trimmed range
  */
@@ -524,62 +508,47 @@ __inline__ __device__ std::pair trim_whitespaces_quote
 }

 /**
- * @brief Decodes a numeric value based on templated cudf type T with specified
- * base.
+ * @brief Adjusts the range to ignore starting/trailing whitespace characters.
  *
- * @param[in] begin Beginning of the character string
- * @param[in] end End of the character string
- * @param opts The global parsing behavior options
+ * @param begin Pointer to the first character in the parsing range
+ * @param end Pointer to the first character after the parsing range
  *
- * @return The parsed numeric value
+ * @return Trimmed range
  */
-template <typename T, int base>
-__inline__ __device__ T decode_value(char const* begin,
-                                     char const* end,
-                                     parse_options_view const& opts)
+__inline__ __device__ std::pair<char const*, char const*> trim_whitespaces(char const* begin,
+                                                                           char const* end)
 {
-  return cudf::io::parse_numeric<T, base>(begin, end, opts);
+  auto not_whitespace = [] __device__(auto c) { return !is_whitespace(c); };
+
+  auto const trim_begin = thrust::find_if(thrust::seq, begin, end, not_whitespace);
+  auto const trim_end   = thrust::find_if(thrust::seq,
+                                          thrust::make_reverse_iterator(end),
+                                          thrust::make_reverse_iterator(trim_begin),
+                                          not_whitespace);
+
+  return {trim_begin, trim_end.base()};
 }

 /**
- * @brief Decodes a numeric value based on templated cudf type T
+ * @brief Adjusts the range to ignore starting/trailing quotation characters.
  *
- * @param[in] begin Beginning of the character string
- * @param[in] end End of the character string
- * @param opts The global parsing behavior options
+ * @param begin Pointer to the first character in the parsing range
+ * @param end Pointer to the first character after the parsing range
+ * @param quotechar The character used to denote quotes. Provide '\0' if no quotes should be
+ * trimmed.
  *
- * @return The parsed numeric value
+ * @return Trimmed range
  */
-template <typename T, CUDF_ENABLE_IF(!cudf::is_timestamp<T>() and !cudf::is_duration<T>())>
-__inline__ __device__ T decode_value(char const* begin,
-                                     char const* end,
-                                     parse_options_view const& opts)
-{
-  return cudf::io::parse_numeric<T>(begin, end, opts);
-}
-
-template <typename T, CUDF_ENABLE_IF(cudf::is_timestamp<T>())>
-__inline__ __device__ T decode_value(char const* begin,
-                                     char const* end,
-                                     parse_options_view const& opts)
-{
-  // If this is a string value, remove quotes
-  if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) {
-    thrust::advance(begin, 1);
-    thrust::advance(end, -1);
-  }
-  return to_timestamp<T>(begin, end, opts.dayfirst);
-}
-
-template <typename T, CUDF_ENABLE_IF(cudf::is_duration<T>())>
-__inline__ __device__ T decode_value(char const* begin, char const* end, parse_options_view const&)
+__inline__ __device__ std::pair<char const*, char const*> trim_quotes(char const* begin,
+                                                                      char const* end,
+                                                                      char quotechar)
 {
-  // If this is a string value, remove quotes
-  if ((thrust::distance(begin, end) >= 2 && *begin == '\"' && *thrust::prev(end) == '\"')) {
+  if ((thrust::distance(begin, end) >= 2 && *begin == quotechar &&
+       *thrust::prev(end) == quotechar)) {
     thrust::advance(begin, 1);
     thrust::advance(end, -1);
   }
-  return to_duration<T>(begin, end);
+  return {begin, end};
 }

 struct ConvertFunctor {
@@ -601,13 +570,15 @@ struct ConvertFunctor {
                                                  parse_options_view const& opts,
                                                  bool as_hex = false)
   {
-    static_cast<T*>(out_buffer)[row] = [as_hex, &opts, begin, end]() -> T {
+    auto const value = [as_hex, &opts, begin, end]() -> std::optional<T> {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
       if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return 1; }
       if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return 0; }
-      return as_hex ? decode_value<T, 16>(begin, end, opts) : decode_value<T>(begin, end, opts);
+      return as_hex ? cudf::io::parse_numeric<T, 16>(begin, end, opts)
+                    : cudf::io::parse_numeric<T>(begin, end, opts);
     }();
+    static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());

     return true;
   }
@@ -626,6 +597,7 @@ struct ConvertFunctor {
                                                  parse_options_view const& opts,
                                                  bool as_hex)
   {
+    // TODO decide what's invalid input and update parsing functions
     static_cast<device_storage_type_t<T>*>(out_buffer)[row] =
       [&opts, output_type, begin, end]() -> device_storage_type_t<T> {
       return strings::detail::parse_decimal<device_storage_type_t<T>>(
@@ -647,13 +619,18 @@ struct ConvertFunctor {
                                                  parse_options_view const& opts,
                                                  bool as_hex)
   {
-    static_cast<T*>(out_buffer)[row] = [&opts, begin, end]() {
+    auto const value = [&opts, begin, end]() -> std::optional<T> {
       // Check for user-specified true/false values
       auto const field_len = static_cast<size_t>(end - begin);
-      if (serialized_trie_contains(opts.trie_true, {begin, field_len})) { return true; }
-      if (serialized_trie_contains(opts.trie_false, {begin, field_len})) { return false; }
-      return decode_value<T>(begin, end, opts);
+      if (serialized_trie_contains(opts.trie_true, {begin, field_len})) {
+        return static_cast<T>(true);
+      }
+      if (serialized_trie_contains(opts.trie_false, {begin, field_len})) {
+        return static_cast<T>(false);
+      }
+      return cudf::io::parse_numeric<T>(begin, end, opts);
     }();
+    static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());

     return true;
   }
@@ -671,10 +648,20 @@ struct ConvertFunctor {
                                                  parse_options_view const& opts,
                                                  bool as_hex)
   {
-    T const value                    = decode_value<T>(begin, end, opts);
-    static_cast<T*>(out_buffer)[row] = value;
+    auto const value = [&opts, begin, end]() -> std::optional<T> {
+      // Check for user-specified true/false values
+      auto const field_len = static_cast<size_t>(end - begin);
+      if (serialized_trie_contains(opts.trie_true, {begin, field_len})) {
+        return static_cast<T>(true);
+      }
+      if (serialized_trie_contains(opts.trie_false, {begin, field_len})) {
+        return static_cast<T>(false);
+      }
+      return cudf::io::parse_numeric<T>(begin, end, opts);
+    }();
+    static_cast<T*>(out_buffer)[row] = value.value_or(std::numeric_limits<T>::quiet_NaN());

-    return !std::isnan(value);
+    return value.has_value() and !std::isnan(*value);
   }

@@ -691,12 +678,15 @@ struct ConvertFunctor {
                                                  parse_options_view const& opts,
                                                  bool as_hex)
   {
-    if constexpr (cudf::is_timestamp<T>() or cudf::is_duration<T>()) {
-      static_cast<T*>(out_buffer)[row] = decode_value<T>(begin, end, opts);
-      return true;
+    // TODO decide what's invalid input and update parsing functions
+    if constexpr (cudf::is_timestamp<T>()) {
+      static_cast<T*>(out_buffer)[row] = to_timestamp<T>(begin, end, opts.dayfirst);
+    } else if constexpr (cudf::is_duration<T>()) {
+      static_cast<T*>(out_buffer)[row] = to_duration<T>(begin, end);
     } else {
       return false;
     }
+    return true;
   }
 };
diff --git a/cpp/src/join/conditional_join.cu b/cpp/src/join/conditional_join.cu
index f0b66559799..cf1476d8bcc 100644
--- a/cpp/src/join/conditional_join.cu
+++ b/cpp/src/join/conditional_join.cu
@@ -298,7 +298,7 @@ conditional_inner_join(table_view const& left,
                          binary_predicate,
                          detail::join_kind::INNER_JOIN,
                          output_size,
-                         cudf::default_stream_value,
+                         cudf::get_default_stream(),
                          mr);
 }
@@ -316,7 +316,7 @@ conditional_left_join(table_view const& left,
                          binary_predicate,
                          detail::join_kind::LEFT_JOIN,
                          output_size,
-                         cudf::default_stream_value,
+                         cudf::get_default_stream(),
                          mr);
 }
@@ -333,7 +333,7 @@ conditional_full_join(table_view const& left,
                          binary_predicate,
                          detail::join_kind::FULL_JOIN,
                          {},
-                         cudf::default_stream_value,
+                         cudf::get_default_stream(),
                          mr);
 }
@@ -350,7 +350,7 @@ std::unique_ptr<rmm::device_uvector<size_type>> conditional_left_semi_join(
binary_predicate, detail::join_kind::LEFT_SEMI_JOIN, output_size, - cudf::default_stream_value, + cudf::get_default_stream(), mr) .first); } @@ -368,7 +368,7 @@ std::unique_ptr> conditional_left_anti_join( binary_predicate, detail::join_kind::LEFT_ANTI_JOIN, output_size, - cudf::default_stream_value, + cudf::get_default_stream(), mr) .first); } @@ -380,7 +380,7 @@ std::size_t conditional_inner_join_size(table_view const& left, { CUDF_FUNC_RANGE(); return detail::compute_conditional_join_output_size( - left, right, binary_predicate, detail::join_kind::INNER_JOIN, cudf::default_stream_value, mr); + left, right, binary_predicate, detail::join_kind::INNER_JOIN, cudf::get_default_stream(), mr); } std::size_t conditional_left_join_size(table_view const& left, @@ -390,7 +390,7 @@ std::size_t conditional_left_join_size(table_view const& left, { CUDF_FUNC_RANGE(); return detail::compute_conditional_join_output_size( - left, right, binary_predicate, detail::join_kind::LEFT_JOIN, cudf::default_stream_value, mr); + left, right, binary_predicate, detail::join_kind::LEFT_JOIN, cudf::get_default_stream(), mr); } std::size_t conditional_left_semi_join_size(table_view const& left, @@ -403,7 +403,7 @@ std::size_t conditional_left_semi_join_size(table_view const& left, right, binary_predicate, detail::join_kind::LEFT_SEMI_JOIN, - cudf::default_stream_value, + cudf::get_default_stream(), mr)); } @@ -417,7 +417,7 @@ std::size_t conditional_left_anti_join_size(table_view const& left, right, binary_predicate, detail::join_kind::LEFT_ANTI_JOIN, - cudf::default_stream_value, + cudf::get_default_stream(), mr)); } diff --git a/cpp/src/join/conditional_join.hpp b/cpp/src/join/conditional_join.hpp index 6de2664b5f6..7c329cd8e17 100644 --- a/cpp/src/join/conditional_join.hpp +++ b/cpp/src/join/conditional_join.hpp @@ -48,7 +48,7 @@ conditional_join(table_view const& left, ast::expression const& binary_predicate, join_kind JoinKind, std::optional output_size = {}, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream = cudf::get_default_stream(), rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); /** @@ -68,7 +68,7 @@ std::size_t compute_conditional_join_output_size( table_view const& right, ast::expression const& binary_predicate, join_kind JoinKind, - rmm::cuda_stream_view stream = cudf::default_stream_value, + rmm::cuda_stream_view stream, rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource()); } // namespace detail diff --git a/cpp/src/join/cross_join.cu b/cpp/src/join/cross_join.cu index 3eb9f1b1198..7358726d69d 100644 --- a/cpp/src/join/cross_join.cu +++ b/cpp/src/join/cross_join.cu @@ -78,7 +78,7 @@ std::unique_ptr cross_join(cudf::table_view const& left, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::cross_join(left, right, cudf::default_stream_value, mr); + return detail::cross_join(left, right, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/join/join.cu b/cpp/src/join/join.cu index bb8fc07c2d7..dbc543f4dcd 100644 --- a/cpp/src/join/join.cu +++ b/cpp/src/join/join.cu @@ -113,7 +113,7 @@ inner_join(table_view const& left, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::inner_join(left, right, compare_nulls, cudf::default_stream_value, mr); + return detail::inner_join(left, right, compare_nulls, cudf::get_default_stream(), mr); } std::pair>, @@ -124,7 +124,7 @@ left_join(table_view const& left, rmm::mr::device_memory_resource* mr) { 
CUDF_FUNC_RANGE(); - return detail::left_join(left, right, compare_nulls, cudf::default_stream_value, mr); + return detail::left_join(left, right, compare_nulls, cudf::get_default_stream(), mr); } std::pair>, @@ -135,7 +135,7 @@ full_join(table_view const& left, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::full_join(left, right, compare_nulls, cudf::default_stream_value, mr); + return detail::full_join(left, right, compare_nulls, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/join/mixed_join.cu b/cpp/src/join/mixed_join.cu index ec2dacaca5b..4cedfca218a 100644 --- a/cpp/src/join/mixed_join.cu +++ b/cpp/src/join/mixed_join.cu @@ -458,7 +458,7 @@ mixed_inner_join( compare_nulls, detail::join_kind::INNER_JOIN, output_size_data, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -479,7 +479,7 @@ std::pair>> mixed_in binary_predicate, compare_nulls, detail::join_kind::INNER_JOIN, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -504,7 +504,7 @@ mixed_left_join( compare_nulls, detail::join_kind::LEFT_JOIN, output_size_data, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -525,7 +525,7 @@ std::pair>> mixed_le binary_predicate, compare_nulls, detail::join_kind::LEFT_JOIN, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -550,7 +550,7 @@ mixed_full_join( compare_nulls, detail::join_kind::FULL_JOIN, output_size_data, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } diff --git a/cpp/src/join/mixed_join_semi.cu b/cpp/src/join/mixed_join_semi.cu index a9897f0f40e..6ebf3702256 100644 --- a/cpp/src/join/mixed_join_semi.cu +++ b/cpp/src/join/mixed_join_semi.cu @@ -503,7 +503,7 @@ std::pair>> mixed_le binary_predicate, compare_nulls, detail::join_kind::LEFT_SEMI_JOIN, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -526,7 +526,7 @@ std::unique_ptr> mixed_left_semi_join( compare_nulls, detail::join_kind::LEFT_SEMI_JOIN, output_size_data, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -547,7 +547,7 @@ std::pair>> mixed_le binary_predicate, compare_nulls, detail::join_kind::LEFT_ANTI_JOIN, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } @@ -570,7 +570,7 @@ std::unique_ptr> mixed_left_anti_join( compare_nulls, detail::join_kind::LEFT_ANTI_JOIN, output_size_data, - cudf::default_stream_value, + cudf::get_default_stream(), mr); } diff --git a/cpp/src/join/semi_join.cu b/cpp/src/join/semi_join.cu index 87bac002f53..cc523b2ac7f 100644 --- a/cpp/src/join/semi_join.cu +++ b/cpp/src/join/semi_join.cu @@ -95,7 +95,7 @@ std::unique_ptr> left_semi_join( { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, cudf::default_stream_value, mr); + detail::join_kind::LEFT_SEMI_JOIN, left, right, compare_nulls, cudf::get_default_stream(), mr); } std::unique_ptr> left_anti_join( @@ -106,7 +106,7 @@ std::unique_ptr> left_anti_join( { CUDF_FUNC_RANGE(); return detail::left_semi_anti_join( - detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, cudf::default_stream_value, mr); + detail::join_kind::LEFT_ANTI_JOIN, left, right, compare_nulls, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/labeling/label_bins.cu b/cpp/src/labeling/label_bins.cu index f5e35fc842f..4c3469c679e 100644 --- a/cpp/src/labeling/label_bins.cu +++ b/cpp/src/labeling/label_bins.cu @@ -244,7 +244,7 @@ std::unique_ptr label_bins(column_view 
const& input,
                                      left_inclusive,
                                      right_edges,
                                      right_inclusive,
-                                     cudf::default_stream_value,
+                                     cudf::get_default_stream(),
                                      mr);
 }
 }  // namespace cudf
diff --git a/cpp/src/lists/combine/concatenate_list_elements.cu b/cpp/src/lists/combine/concatenate_list_elements.cu
index c107bad018d..496d9ee670a 100644
--- a/cpp/src/lists/combine/concatenate_list_elements.cu
+++ b/cpp/src/lists/combine/concatenate_list_elements.cu
@@ -287,7 +287,7 @@ std::unique_ptr<column> concatenate_list_elements(column_view const& input,
                                                   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::concatenate_list_elements(input, null_policy, cudf::default_stream_value, mr);
+  return detail::concatenate_list_elements(input, null_policy, cudf::get_default_stream(), mr);
 }

 }  // namespace lists
diff --git a/cpp/src/lists/combine/concatenate_rows.cu b/cpp/src/lists/combine/concatenate_rows.cu
index 4364470407f..8b006548391 100644
--- a/cpp/src/lists/combine/concatenate_rows.cu
+++ b/cpp/src/lists/combine/concatenate_rows.cu
@@ -245,7 +245,8 @@ std::unique_ptr<column> concatenate_rows(table_view const& input,
          row_null_counts = row_null_counts.data()] __device__(size_t i) -> size_type {
           auto const row_index = i % num_rows;
           return row_null_counts[row_index] != num_columns;
-        });
+        },
+        stream);
     }
     // NULLIFY_OUTPUT_ROW. Output row is nullified if any input row is null
     return cudf::detail::valid_if(
@@ -255,7 +256,8 @@ std::unique_ptr concatenate_rows(table_view const& input,
        row_null_counts = row_null_counts.data()] __device__(size_t i) -> size_type {
         auto const row_index = i % num_rows;
         return row_null_counts[row_index] == 0;
-      });
+      },
+      stream);
   }();
   concat->set_null_mask(std::move(null_mask), null_count);
 }
@@ -307,7 +309,7 @@ std::unique_ptr<column> concatenate_rows(table_view const& input,
                                          rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::concatenate_rows(input, null_policy, cudf::default_stream_value, mr);
+  return detail::concatenate_rows(input, null_policy, cudf::get_default_stream(), mr);
 }

 }  // namespace lists
diff --git a/cpp/src/lists/contains.cu b/cpp/src/lists/contains.cu
index 3a52426c16a..0142e736fd0 100644
--- a/cpp/src/lists/contains.cu
+++ b/cpp/src/lists/contains.cu
@@ -495,7 +495,7 @@ std::unique_ptr<column> contains(lists_column_view const& lists,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::contains(lists, search_key, cudf::default_stream_value, mr);
+  return detail::contains(lists, search_key, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> contains(lists_column_view const& lists,
@@ -503,14 +503,14 @@ std::unique_ptr<column> contains(lists_column_view const& lists,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::contains(lists, search_keys, cudf::default_stream_value, mr);
+  return detail::contains(lists, search_keys, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> contains_nulls(lists_column_view const& lists,
                                        rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::contains_nulls(lists, cudf::default_stream_value, mr);
+  return detail::contains_nulls(lists, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> index_of(lists_column_view const& lists,
@@ -519,7 +519,7 @@ std::unique_ptr<column> index_of(lists_column_view const& lists,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::index_of(lists, search_key, find_option, cudf::default_stream_value, mr);
+  return detail::index_of(lists, search_key, find_option, cudf::get_default_stream(), mr);
 }

 std::unique_ptr<column> index_of(lists_column_view const& lists,
@@
-528,7 +528,7 @@ std::unique_ptr index_of(lists_column_view const& lists, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::index_of(lists, search_keys, find_option, cudf::default_stream_value, mr); + return detail::index_of(lists, search_keys, find_option, cudf::get_default_stream(), mr); } } // namespace cudf::lists diff --git a/cpp/src/lists/copying/gather.cu b/cpp/src/lists/copying/gather.cu index ae9fab4dda2..eda46e05f18 100644 --- a/cpp/src/lists/copying/gather.cu +++ b/cpp/src/lists/copying/gather.cu @@ -100,36 +100,17 @@ std::unique_ptr gather_list_leaf(column_view const& column, size_type gather_map_size = gd.gather_map_size; // call the normal gather - auto leaf_column = cudf::type_dispatcher( - column.type(), - cudf::detail::column_gatherer{}, - column, - gather_map_begin, - gather_map_begin + gather_map_size, - // note : we don't need to bother checking for out-of-bounds here since - // our inputs at this stage aren't coming from the user. - false, - stream, - mr); - - // the column_gatherer doesn't create the null mask because it expects - // that will be done in the gather_bitmask() step. however, gather_bitmask() - // only happens at the root level, and by definition this column is a - // leaf. so we have to generate the bitmask ourselves. - // TODO : it might make sense to expose a gather() function that takes a column_view and - // returns a column that does this work correctly. - size_type null_count = column.null_count(); - if (null_count > 0) { - auto list_cdv = column_device_view::create(column, stream); - auto validity = cudf::detail::valid_if( - gather_map_begin, - gather_map_begin + gd.gather_map_size, - [cdv = *list_cdv] __device__(int index) { return cdv.is_valid(index) ? true : false; }, - stream, - mr); - - leaf_column->set_null_mask(std::move(validity.first), validity.second); - } + // note : we don't need to bother checking for out-of-bounds here since + // our inputs at this stage aren't coming from the user. + auto gather_table = cudf::detail::gather(cudf::table_view({column}), + gather_map_begin, + gather_map_begin + gather_map_size, + out_of_bounds_policy::DONT_CHECK, + stream, + mr); + auto leaf_column = std::move(gather_table->release().front()); + + if (column.null_count() == 0) { leaf_column->set_null_mask(rmm::device_buffer{}, 0); } return leaf_column; } diff --git a/cpp/src/lists/copying/scatter_helper.cu b/cpp/src/lists/copying/scatter_helper.cu index cbb3aec76c5..ca7ca2f6590 100644 --- a/cpp/src/lists/copying/scatter_helper.cu +++ b/cpp/src/lists/copying/scatter_helper.cu @@ -185,7 +185,7 @@ struct list_child_constructor { mr); thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(child_column->size()), child_column->mutable_view().begin(), @@ -237,7 +237,7 @@ struct list_child_constructor { auto const null_string_view = string_view{nullptr, 0}; // placeholder for factory function thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(string_views.size()), string_views.begin(), @@ -304,7 +304,7 @@ struct list_child_constructor { // For instance, if a parent list_device_view has 3 elements, it should have 3 corresponding // child list_device_view instances. 
thrust::transform( - rmm::exec_policy(stream), + rmm::exec_policy_nosync(stream), thrust::make_counting_iterator(0), thrust::make_counting_iterator(child_list_views.size()), child_list_views.begin(), diff --git a/cpp/src/lists/copying/segmented_gather.cu b/cpp/src/lists/copying/segmented_gather.cu index db37a82ba8e..2c12e09bcd9 100644 --- a/cpp/src/lists/copying/segmented_gather.cu +++ b/cpp/src/lists/copying/segmented_gather.cu @@ -120,7 +120,7 @@ std::unique_ptr segmented_gather(lists_column_view const& source_column, { CUDF_FUNC_RANGE(); return detail::segmented_gather( - source_column, gather_map_list, bounds_policy, cudf::default_stream_value, mr); + source_column, gather_map_list, bounds_policy, cudf::get_default_stream(), mr); } } // namespace lists diff --git a/cpp/src/lists/count_elements.cu b/cpp/src/lists/count_elements.cu index 68748dfde3f..f8e7b4c6126 100644 --- a/cpp/src/lists/count_elements.cu +++ b/cpp/src/lists/count_elements.cu @@ -76,7 +76,7 @@ std::unique_ptr count_elements(lists_column_view const& input, rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::count_elements(input, cudf::default_stream_value, mr); + return detail::count_elements(input, cudf::get_default_stream(), mr); } } // namespace lists diff --git a/cpp/src/lists/dremel.cu b/cpp/src/lists/dremel.cu index cb9cd4293b5..66134138a5c 100644 --- a/cpp/src/lists/dremel.cu +++ b/cpp/src/lists/dremel.cu @@ -192,8 +192,7 @@ dremel_data get_dremel_data(column_view h_col, } if (curr_col.type().id() == type_id::LIST) { auto child = curr_col.child(lists_column_view::child_column_index); - if ((child.type().id() == type_id::INT8 || child.type().id() == type_id::UINT8) && - output_as_byte_array) { + if (output_as_byte_array && child.type().id() == type_id::UINT8) { // consider this the bottom break; } @@ -225,6 +224,7 @@ dremel_data get_dremel_data(column_view h_col, cudf::detail::device_single_thread( [offset_at_level = d_column_offsets.data(), end_idx_at_level = d_column_ends.data(), + level_max = d_column_offsets.size(), col = *d_col] __device__() { auto curr_col = col; size_type off = curr_col.offset(); @@ -239,9 +239,11 @@ dremel_data get_dremel_data(column_view h_col, if (curr_col.type().id() == type_id::LIST) { off = curr_col.child(lists_column_view::offsets_column_index).element(off); end = curr_col.child(lists_column_view::offsets_column_index).element(end); - offset_at_level[level] = off; - end_idx_at_level[level] = end; - ++level; + if (level < level_max) { + offset_at_level[level] = off; + end_idx_at_level[level] = end; + ++level; + } curr_col = curr_col.child(lists_column_view::child_column_index); } else { curr_col = curr_col.child(0); diff --git a/cpp/src/lists/explode.cu b/cpp/src/lists/explode.cu index 873b0fe408d..4db3254f201 100644 --- a/cpp/src/lists/explode.cu +++ b/cpp/src/lists/explode.cu @@ -299,7 +299,7 @@ std::unique_ptr
explode(table_view const& input_table,
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST,
                "Unsupported non-list column");
-  return detail::explode(input_table, explode_column_idx, cudf::default_stream_value, mr);
+  return detail::explode(input_table, explode_column_idx, cudf::get_default_stream(), mr);
 }

 /**
@@ -312,7 +312,7 @@ std::unique_ptr<table>
explode_position(table_view const& input_table,
   CUDF_FUNC_RANGE();
   CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST,
                "Unsupported non-list column");
-  return detail::explode_position(input_table, explode_column_idx, cudf::default_stream_value, mr);
+  return detail::explode_position(input_table, explode_column_idx, cudf::get_default_stream(), mr);
 }

 /**
@@ -326,7 +326,7 @@ std::unique_ptr<table>
explode_outer(table_view const& input_table,
   CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST,
                "Unsupported non-list column");
   return detail::explode_outer(
-    input_table, explode_column_idx, false, cudf::default_stream_value, mr);
+    input_table, explode_column_idx, false, cudf::get_default_stream(), mr);
 }

 /**
@@ -341,7 +341,7 @@ std::unique_ptr<table>
explode_outer_position(table_view const& input_table, CUDF_EXPECTS(input_table.column(explode_column_idx).type().id() == type_id::LIST, "Unsupported non-list column"); return detail::explode_outer( - input_table, explode_column_idx, true, cudf::default_stream_value, mr); + input_table, explode_column_idx, true, cudf::get_default_stream(), mr); } } // namespace cudf diff --git a/cpp/src/lists/extract.cu b/cpp/src/lists/extract.cu index bc04bad7c0c..d1807c2c5ac 100644 --- a/cpp/src/lists/extract.cu +++ b/cpp/src/lists/extract.cu @@ -171,7 +171,7 @@ std::unique_ptr extract_list_element(lists_column_view const& lists_colu rmm::mr::device_memory_resource* mr) { CUDF_FUNC_RANGE(); - return detail::extract_list_element(lists_column, index, cudf::default_stream_value, mr); + return detail::extract_list_element(lists_column, index, cudf::get_default_stream(), mr); } /** @@ -186,7 +186,7 @@ std::unique_ptr extract_list_element(lists_column_view const& lists_colu CUDF_FUNC_RANGE(); CUDF_EXPECTS(indices.size() == lists_column.size(), "Index column must have as many elements as lists column."); - return detail::extract_list_element(lists_column, indices, cudf::default_stream_value, mr); + return detail::extract_list_element(lists_column, indices, cudf::get_default_stream(), mr); } } // namespace lists diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu index ea35977e8e4..260636a61cf 100644 --- a/cpp/src/lists/segmented_sort.cu +++ b/cpp/src/lists/segmented_sort.cu @@ -15,12 +15,8 @@ */ #include -#include #include -#include #include -#include -#include #include #include #include @@ -28,231 +24,27 @@ #include #include #include -#include #include -#include -#include #include -#include -#include #include -#include - namespace cudf { namespace lists { namespace detail { -struct SegmentedSortColumn { - /** - * @brief Compile time check for allowing radix sort for column type. - * - * Floating point is not included here because of the special handling of NaNs. 
diff --git a/cpp/src/lists/segmented_sort.cu b/cpp/src/lists/segmented_sort.cu
index ea35977e8e4..260636a61cf 100644
--- a/cpp/src/lists/segmented_sort.cu
+++ b/cpp/src/lists/segmented_sort.cu
@@ -15,12 +15,8 @@
  */
 
 #include
-#include
 #include
-#include
 #include
-#include
-#include
 #include
 #include
 #include
@@ -28,231 +24,27 @@
 #include
 #include
 #include
-#include
 #include
-#include
-#include
 #include
-#include
-#include
 #include
-#include
-
 namespace cudf {
 namespace lists {
 namespace detail {
 
-struct SegmentedSortColumn {
-  /**
-   * @brief Compile time check for allowing radix sort for column type.
-   *
-   * Floating point is not included here because of the special handling of NaNs.
-   */
-  template <typename T>
-  static constexpr bool is_radix_sort_supported()
-  {
-    return std::is_integral<T>();
-  }
-
-  template <typename KeyT, typename ValueT, typename OffsetIteratorT>
-  void SortPairsAscending(KeyT const* keys_in,
-                          KeyT* keys_out,
-                          ValueT const* values_in,
-                          ValueT* values_out,
-                          int num_items,
-                          int num_segments,
-                          OffsetIteratorT begin_offsets,
-                          OffsetIteratorT end_offsets,
-                          rmm::cuda_stream_view stream)
-  {
-    rmm::device_buffer d_temp_storage;
-    size_t temp_storage_bytes = 0;
-    cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage.data(),
-                                             temp_storage_bytes,
-                                             keys_in,
-                                             keys_out,
-                                             values_in,
-                                             values_out,
-                                             num_items,
-                                             num_segments,
-                                             begin_offsets,
-                                             end_offsets,
-                                             0,
-                                             sizeof(KeyT) * 8,
-                                             stream.value());
-    d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream};
-
-    cub::DeviceSegmentedRadixSort::SortPairs(d_temp_storage.data(),
-                                             temp_storage_bytes,
-                                             keys_in,
-                                             keys_out,
-                                             values_in,
-                                             values_out,
-                                             num_items,
-                                             num_segments,
-                                             begin_offsets,
-                                             end_offsets,
-                                             0,
-                                             sizeof(KeyT) * 8,
-                                             stream.value());
-  }
-
-  template <typename KeyT, typename ValueT, typename OffsetIteratorT>
-  void SortPairsDescending(KeyT const* keys_in,
-                           KeyT* keys_out,
-                           ValueT const* values_in,
-                           ValueT* values_out,
-                           int num_items,
-                           int num_segments,
-                           OffsetIteratorT begin_offsets,
-                           OffsetIteratorT end_offsets,
-                           rmm::cuda_stream_view stream)
-  {
-    rmm::device_buffer d_temp_storage;
-    size_t temp_storage_bytes = 0;
-    cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage.data(),
-                                                       temp_storage_bytes,
-                                                       keys_in,
-                                                       keys_out,
-                                                       values_in,
-                                                       values_out,
-                                                       num_items,
-                                                       num_segments,
-                                                       begin_offsets,
-                                                       end_offsets,
-                                                       0,
-                                                       sizeof(KeyT) * 8,
-                                                       stream.value());
-    d_temp_storage = rmm::device_buffer{temp_storage_bytes, stream};
-
-    cub::DeviceSegmentedRadixSort::SortPairsDescending(d_temp_storage.data(),
-                                                       temp_storage_bytes,
-                                                       keys_in,
-                                                       keys_out,
-                                                       values_in,
-                                                       values_out,
-                                                       num_items,
-                                                       num_segments,
-                                                       begin_offsets,
-                                                       end_offsets,
-                                                       0,
-                                                       sizeof(KeyT) * 8,
-                                                       stream.value());
-  }
-
-  template <typename T>
-  std::enable_if_t<!is_radix_sort_supported<T>(), std::unique_ptr<column>> operator()(
-    column_view const& child,
-    column_view const& segment_offsets,
-    order column_order,
-    null_order null_precedence,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
-  {
-    auto child_table = segmented_sort_by_key(table_view{{child}},
-                                             table_view{{child}},
-                                             segment_offsets,
-                                             {column_order},
-                                             {null_precedence},
-                                             stream,
-                                             mr);
-    return std::move(child_table->release().front());
-  }
-
-  template <typename T>
-  std::enable_if_t<is_radix_sort_supported<T>(), std::unique_ptr<column>> operator()(
-    column_view const& child,
-    column_view const& offsets,
-    order column_order,
-    null_order null_precedence,
-    rmm::cuda_stream_view stream,
-    rmm::mr::device_memory_resource* mr)
-  {
-    // the average list size at which to prefer radixsort:
-    constexpr cudf::size_type MIN_AVG_LIST_SIZE_FOR_RADIXSORT{100};
-
-    if ((child.size() / offsets.size()) < MIN_AVG_LIST_SIZE_FOR_RADIXSORT) {
-      auto child_table = segmented_sort_by_key(table_view{{child}},
-                                               table_view{{child}},
-                                               offsets,
-                                               {column_order},
-                                               {null_precedence},
-                                               stream,
-                                               mr);
-      return std::move(child_table->release().front());
-    }
-
-    auto output =
-      cudf::detail::allocate_like(child, child.size(), mask_allocation_policy::NEVER, stream, mr);
-    mutable_column_view mutable_output_view = output->mutable_view();
-
-    auto keys = [&]() {
-      if (child.nullable()) {
-        rmm::device_uvector<T> keys(child.size(), stream);
-        auto const null_replace_T = null_precedence == null_order::AFTER
-                                      ? std::numeric_limits<T>::max()
-                                      : std::numeric_limits<T>::min();
-
-        auto device_child = column_device_view::create(child, stream);
-        auto keys_in =
-          cudf::detail::make_null_replacement_iterator(*device_child, null_replace_T);
-        thrust::copy_n(rmm::exec_policy(stream), keys_in, child.size(), keys.begin());
-        return keys;
-      }
-      return rmm::device_uvector<T>{0, stream};
-    }();
+namespace {
 
-    std::unique_ptr<column> sorted_indices = cudf::make_numeric_column(
-      data_type(type_to_id<size_type>()), child.size(), mask_state::UNALLOCATED, stream, mr);
-    mutable_column_view mutable_indices_view = sorted_indices->mutable_view();
-    thrust::sequence(rmm::exec_policy(stream),
-                     mutable_indices_view.begin<size_type>(),
-                     mutable_indices_view.end<size_type>(),
-                     0);
-
-    if (column_order == order::ASCENDING)
-      SortPairsAscending(child.nullable() ? keys.data() : child.begin<T>(),
-                         mutable_output_view.begin<T>(),
-                         mutable_indices_view.begin<size_type>(),
-                         mutable_indices_view.begin<size_type>(),
-                         child.size(),
-                         offsets.size() - 1,
-                         offsets.begin<size_type>(),
-                         offsets.begin<size_type>() + 1,
-                         stream);
-    else
-      SortPairsDescending(child.nullable() ? keys.data() : child.begin<T>(),
-                          mutable_output_view.begin<T>(),
-                          mutable_indices_view.begin<size_type>(),
-                          mutable_indices_view.begin<size_type>(),
-                          child.size(),
-                          offsets.size() - 1,
-                          offsets.begin<size_type>(),
-                          offsets.begin<size_type>() + 1,
-                          stream);
-    std::vector<std::unique_ptr<column>> output_cols;
-    output_cols.push_back(std::move(output));
-    // rearrange the null_mask.
-    cudf::detail::gather_bitmask(cudf::table_view{{child}},
-                                 mutable_indices_view.begin<size_type>(),
-                                 output_cols,
-                                 cudf::detail::gather_bitmask_op::DONT_CHECK,
-                                 stream,
-                                 mr);
-    return std::move(output_cols.front());
-  }
-};
-
-std::unique_ptr<column> sort_lists(lists_column_view const& input,
-                                   order column_order,
-                                   null_order null_precedence,
-                                   rmm::cuda_stream_view stream,
-                                   rmm::mr::device_memory_resource* mr)
+/**
+ * @brief Create output offsets for segmented sort
+ *
+ * This creates a normalized set of offsets from the offsets child column of the input.
+ */
+std::unique_ptr<column> build_output_offsets(lists_column_view const& input,
+                                             rmm::cuda_stream_view stream,
+                                             rmm::mr::device_memory_resource* mr)
 {
-  if (input.is_empty()) return empty_like(input.parent());
   auto output_offset = make_numeric_column(
     input.offsets().type(), input.size() + 1, mask_state::UNALLOCATED, stream, mr);
   thrust::transform(rmm::exec_policy(stream),
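The deleted SegmentedSortColumn relies on CUB's two-phase calling convention: a first call with a null temporary buffer only computes temp_storage_bytes, and an identical second call performs the sort. A condensed sketch of that idiom (the free-function wrapper and names are illustrative):

#include <cub/device/device_segmented_radix_sort.cuh>
#include <rmm/device_buffer.hpp>

template <typename KeyT, typename ValueT, typename OffsetIt>
void segmented_radix_sort(KeyT const* keys_in, KeyT* keys_out,
                          ValueT const* vals_in, ValueT* vals_out,
                          int num_items, int num_segments,
                          OffsetIt begin_offsets, OffsetIt end_offsets,
                          cudaStream_t stream)
{
  size_t bytes = 0;
  // Pass 1: d_temp_storage == nullptr, so CUB only reports the scratch size.
  cub::DeviceSegmentedRadixSort::SortPairs(nullptr, bytes, keys_in, keys_out,
                                           vals_in, vals_out, num_items, num_segments,
                                           begin_offsets, end_offsets,
                                           0, sizeof(KeyT) * 8, stream);
  rmm::device_buffer temp{bytes, rmm::cuda_stream_view{stream}};
  // Pass 2: same arguments with real scratch space; this launches the sort.
  cub::DeviceSegmentedRadixSort::SortPairs(temp.data(), bytes, keys_in, keys_out,
                                           vals_in, vals_out, num_items, num_segments,
                                           begin_offsets, end_offsets,
                                           0, sizeof(KeyT) * 8, stream);
}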
@@ -262,25 +54,35 @@ std::unique_ptr<column> sort_lists(lists_column_view const& input,
                     [first = input.offsets_begin()] __device__(auto offset_index) {
                       return offset_index - *first;
                     });
-  // for numeric columns, calls Faster segmented radix sort path
-  // for non-numeric columns, calls segmented_sort_by_key.
-  auto output_child = type_dispatcher(input.child().type(),
-                                      SegmentedSortColumn{},
-                                      input.get_sliced_child(stream),
-                                      output_offset->view(),
-                                      column_order,
-                                      null_precedence,
-                                      stream,
-                                      mr);
+  return output_offset;
+}
+
+}  // namespace
+
+std::unique_ptr<column> sort_lists(lists_column_view const& input,
+                                   order column_order,
+                                   null_order null_precedence,
+                                   rmm::cuda_stream_view stream,
+                                   rmm::mr::device_memory_resource* mr)
+{
+  if (input.is_empty()) return empty_like(input.parent());
+
+  auto output_offset = build_output_offsets(input, stream, mr);
+  auto const child   = input.get_sliced_child(stream);
 
-  auto null_mask = cudf::detail::copy_bitmask(input.parent(), stream, mr);
+  auto const sorted_child_table = segmented_sort_by_key(table_view{{child}},
+                                                        table_view{{child}},
+                                                        output_offset->view(),
+                                                        {column_order},
+                                                        {null_precedence},
+                                                        stream,
+                                                        mr);
 
-  // Assemble list column & return
   return make_lists_column(input.size(),
                            std::move(output_offset),
-                           std::move(output_child),
+                           std::move(sorted_child_table->release().front()),
                            input.null_count(),
-                           std::move(null_mask),
+                           cudf::detail::copy_bitmask(input.parent(), stream, mr),
                            stream,
                            mr);
 }
@@ -293,17 +95,9 @@ std::unique_ptr<column> stable_sort_lists(lists_column_view const& input,
 {
   if (input.is_empty()) { return empty_like(input.parent()); }
 
-  auto output_offset = make_numeric_column(
-    input.offsets().type(), input.size() + 1, mask_state::UNALLOCATED, stream, mr);
-  thrust::transform(rmm::exec_policy(stream),
-                    input.offsets_begin(),
-                    input.offsets_end(),
-                    output_offset->mutable_view().template begin<size_type>(),
-                    [first = input.offsets_begin()] __device__(auto offset_index) {
-                      return offset_index - *first;
-                    });
+  auto output_offset = build_output_offsets(input, stream, mr);
+  auto const child   = input.get_sliced_child(stream);
 
-  auto const child = input.get_sliced_child(stream);
   auto const sorted_child_table = stable_segmented_sort_by_key(table_view{{child}},
                                                                table_view{{child}},
                                                                output_offset->view(),
@@ -328,7 +122,7 @@ std::unique_ptr<column> sort_lists(lists_column_view const& input,
                                    rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sort_lists(input, column_order, null_precedence, cudf::default_stream_value, mr);
+  return detail::sort_lists(input, column_order, null_precedence, cudf::get_default_stream(), mr);
 }
 
 std::unique_ptr<column> stable_sort_lists(lists_column_view const& input,
@@ -338,7 +132,7 @@ std::unique_ptr<column> stable_sort_lists(lists_column_view const& input,
 {
   CUDF_FUNC_RANGE();
   return detail::stable_sort_lists(
-    input, column_order, null_precedence, cudf::default_stream_value, mr);
+    input, column_order, null_precedence, cudf::get_default_stream(), mr);
 }
 
 }  // namespace lists
diff --git a/cpp/src/lists/sequences.cu b/cpp/src/lists/sequences.cu
index 6c2b0b1a785..bb0e669339a 100644
--- a/cpp/src/lists/sequences.cu
+++ b/cpp/src/lists/sequences.cu
@@ -214,7 +214,7 @@ std::unique_ptr<column> sequences(column_view const& starts,
                                   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sequences(starts, sizes, cudf::default_stream_value, mr);
+  return detail::sequences(starts, sizes, cudf::get_default_stream(), mr);
 }
 
 std::unique_ptr<column> sequences(column_view const& starts,
@@ -223,7 +223,7 @@ std::unique_ptr<column> sequences(column_view const& starts,
                                   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::sequences(starts, steps, sizes, cudf::default_stream_value, mr);
+  return detail::sequences(starts, steps, sizes, cudf::get_default_stream(), mr);
 }
 
 }  // namespace cudf::lists
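build_output_offsets above rebases a (possibly sliced) lists column's offsets so they start at zero, which is what segmented_sort_by_key expects. A host-side sketch of the same normalization (the real code runs this transform on device with thrust):

#include <algorithm>
#include <vector>

// Rebase offsets so the first entry becomes 0, e.g. {7, 9, 12} -> {0, 2, 5}.
std::vector<int> normalize_offsets(std::vector<int> const& offsets)
{
  std::vector<int> out(offsets.size());
  int const first = offsets.front();
  std::transform(offsets.begin(), offsets.end(), out.begin(),
                 [first](int o) { return o - first; });
  return out;
}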
diff --git a/cpp/src/lists/set_operations.cu b/cpp/src/lists/set_operations.cu
index 00cdfcf7ff1..a31b7c6e5be 100644
--- a/cpp/src/lists/set_operations.cu
+++ b/cpp/src/lists/set_operations.cu
@@ -17,7 +17,7 @@
 #include "utilities.hpp"
 
 #include
-#include
+#include
 #include
 #include
 #include
@@ -176,9 +176,8 @@ std::unique_ptr<column> intersect_distinct(lists_column_view const& lhs,
                                   stream,
                                   mr);
 
-  return null_count == 0
-           ? std::move(output)
-           : cudf::detail::purge_nonempty_nulls(lists_column_view{output->view()}, stream, mr);
+  return null_count == 0 ? std::move(output)
+                         : cudf::detail::purge_nonempty_nulls(output->view(), stream, mr);
 }
 
 std::unique_ptr<column> union_distinct(lists_column_view const& lhs,
@@ -253,9 +252,8 @@ std::unique_ptr<column> difference_distinct(lists_column_view const& lhs,
                                   stream,
                                   mr);
 
-  return null_count == 0
-           ? std::move(output)
-           : cudf::detail::purge_nonempty_nulls(lists_column_view{output->view()}, stream, mr);
+  return null_count == 0 ? std::move(output)
+                         : cudf::detail::purge_nonempty_nulls(output->view(), stream, mr);
 }
 
 }  // namespace detail
@@ -267,7 +265,7 @@ std::unique_ptr<column> have_overlap(lists_column_view const& lhs,
                                      rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::have_overlap(lhs, rhs, nulls_equal, nans_equal, cudf::default_stream_value, mr);
+  return detail::have_overlap(lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr);
 }
 
 std::unique_ptr<column> intersect_distinct(lists_column_view const& lhs,
@@ -278,7 +276,7 @@ std::unique_ptr<column> intersect_distinct(lists_column_view const& lhs,
 {
   CUDF_FUNC_RANGE();
   return detail::intersect_distinct(
-    lhs, rhs, nulls_equal, nans_equal, cudf::default_stream_value, mr);
+    lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr);
 }
 
 std::unique_ptr<column> union_distinct(lists_column_view const& lhs,
@@ -288,7 +286,7 @@ std::unique_ptr<column> union_distinct(lists_column_view const& lhs,
                                        rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::union_distinct(lhs, rhs, nulls_equal, nans_equal, cudf::default_stream_value, mr);
+  return detail::union_distinct(lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr);
 }
 
 std::unique_ptr<column> difference_distinct(lists_column_view const& lhs,
@@ -299,7 +297,7 @@ std::unique_ptr<column> difference_distinct(lists_column_view const& lhs,
 {
   CUDF_FUNC_RANGE();
   return detail::difference_distinct(
-    lhs, rhs, nulls_equal, nans_equal, cudf::default_stream_value, mr);
+    lhs, rhs, nulls_equal, nans_equal, cudf::get_default_stream(), mr);
 }
 
 }  // namespace cudf::lists
diff --git a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu
index c99486ca8b0..c1c17dc0688 100644
--- a/cpp/src/lists/stream_compaction/apply_boolean_mask.cu
+++ b/cpp/src/lists/stream_compaction/apply_boolean_mask.cu
@@ -104,7 +104,7 @@ std::unique_ptr<column> apply_boolean_mask(lists_column_view const& input,
                                            rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::apply_boolean_mask(input, boolean_mask, cudf::default_stream_value, mr);
+  return detail::apply_boolean_mask(input, boolean_mask, cudf::get_default_stream(), mr);
 }
 
 }  // namespace cudf::lists
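Both set-operation hunks keep the common case cheap: when null_count == 0 the result is returned untouched, and the allocating purge of non-empty null rows runs only when nulls exist. A sketch of that guard, with purge standing in for cudf::detail::purge_nonempty_nulls:

#include <memory>
#include <utility>

struct column {};  // stand-in for cudf::column

// Hypothetical purge: rewrite rows under null entries so they hold no data.
std::unique_ptr<column> purge(column const&) { return std::make_unique<column>(); }

// Fast path: with no nulls there is nothing to purge, so the column is
// returned as-is; otherwise pay for the copying purge (as in the diff).
std::unique_ptr<column> finalize(std::unique_ptr<column> output, int null_count)
{
  return null_count == 0 ? std::move(output) : purge(*output);
}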
diff --git a/cpp/src/lists/stream_compaction/distinct.cu b/cpp/src/lists/stream_compaction/distinct.cu
index c88209292de..d0e4557663e 100644
--- a/cpp/src/lists/stream_compaction/distinct.cu
+++ b/cpp/src/lists/stream_compaction/distinct.cu
@@ -78,7 +78,7 @@ std::unique_ptr<column> distinct(lists_column_view const& input,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::distinct(input, nulls_equal, nans_equal, cudf::default_stream_value, mr);
+  return detail::distinct(input, nulls_equal, nans_equal, cudf::get_default_stream(), mr);
 }
 
 }  // namespace cudf::lists
diff --git a/cpp/src/merge/merge.cu b/cpp/src/merge/merge.cu
index 91018d3f006..d9c573e8155 100644
--- a/cpp/src/merge/merge.cu
+++ b/cpp/src/merge/merge.cu
@@ -171,7 +171,7 @@ index_vector generate_merged_indices(table_view const& left_table,
                                      std::vector<order> const& column_order,
                                      std::vector<null_order> const& null_precedence,
                                      bool nullable                = true,
-                                     rmm::cuda_stream_view stream = cudf::default_stream_value)
+                                     rmm::cuda_stream_view stream = cudf::get_default_stream())
 {
   const size_type left_size  = left_table.num_rows();
   const size_type right_size = right_table.num_rows();
@@ -540,7 +540,7 @@ std::unique_ptr<table> merge(std::vector<table_view> const& tables_to_merg
 {
   CUDF_FUNC_RANGE();
   return detail::merge(
-    tables_to_merge, key_cols, column_order, null_precedence, cudf::default_stream_value, mr);
+    tables_to_merge, key_cols, column_order, null_precedence, cudf::get_default_stream(), mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/partitioning/partitioning.cu b/cpp/src/partitioning/partitioning.cu
index 3e0cc26dcdd..cbe65354696 100644
--- a/cpp/src/partitioning/partitioning.cu
+++ b/cpp/src/partitioning/partitioning.cu
@@ -17,6 +17,7 @@
 #include
 #include
 #include
+#include
 #include
 #include
 #include
@@ -436,15 +437,13 @@ struct copy_block_partitions_dispatcher {
                                grid_size,
                                stream);
 
-    // Use gather instead for non-fixed width types
-    return type_dispatcher(input.type(),
-                           detail::column_gatherer{},
-                           input,
-                           gather_map.begin(),
-                           gather_map.end(),
-                           false,
-                           stream,
-                           mr);
+    auto gather_table = cudf::detail::gather(cudf::table_view({input}),
+                                             gather_map,
+                                             out_of_bounds_policy::DONT_CHECK,
+                                             cudf::detail::negative_index_policy::NOT_ALLOWED,
+                                             stream,
+                                             mr);
+    return std::move(gather_table->release().front());
   }
 };
 
@@ -610,7 +609,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> hash_partition_table(
   // Use the resulting scatter map to materialize the output
   auto output = detail::scatter(
-    input, row_partition_numbers.begin(), row_partition_numbers.end(), input, false, stream, mr);
+    input, row_partition_numbers.begin(), row_partition_numbers.end(), input, stream, mr);
 
   stream.synchronize();  // Async D2H copy must finish before returning host vec
   return std::pair(std::move(output), std::move(partition_offsets));
@@ -698,7 +697,7 @@ struct dispatch_map_type {
 
     // Scatter the rows into their partitions
     auto scattered =
-      cudf::detail::scatter(t, scatter_map.begin(), scatter_map.end(), t, false, stream, mr);
+      cudf::detail::scatter(t, scatter_map.begin(), scatter_map.end(), t, stream, mr);
 
     return std::pair(std::move(scattered), std::move(partition_offsets));
   }
@@ -797,7 +796,7 @@ std::pair<std::unique_ptr<table>, std::vector<size_type>> partition(
   rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::partition(t, partition_map, num_partitions, cudf::default_stream_value, mr);
+  return detail::partition(t, partition_map, num_partitions, cudf::get_default_stream(), mr);
 }
 
 }  // namespace cudf
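The partitioning.cu hunk above retires the per-type column_gatherer dispatch in favor of the shared table-level gather: wrap the single column in a table_view, gather, then release the lone child. A sketch of the wrap/unwrap idiom using the public overload (the diff uses cudf::detail::gather):

#include <cudf/column/column.hpp>
#include <cudf/copying.hpp>
#include <cudf/table/table.hpp>

#include <memory>
#include <utility>

// Gather rows of a single column by viewing it as a one-column table.
std::unique_ptr<cudf::column> gather_one(cudf::column_view const& input,
                                         cudf::column_view const& gather_map)
{
  auto result = cudf::gather(cudf::table_view({input}), gather_map,
                             cudf::out_of_bounds_policy::DONT_CHECK);
  return std::move(result->release().front());  // reclaim the lone column
}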
diff --git a/cpp/src/partitioning/round_robin.cu b/cpp/src/partitioning/round_robin.cu
index d455df3e890..990992cd8f2 100644
--- a/cpp/src/partitioning/round_robin.cu
+++ b/cpp/src/partitioning/round_robin.cu
@@ -20,7 +20,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
 #include
@@ -153,7 +152,7 @@ std::pair<std::unique_ptr<table>, std::vector<cudf::size_type>> round_robin_part
   table_view const& input,
   cudf::size_type num_partitions,
   cudf::size_type start_partition     = 0,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
+  rmm::cuda_stream_view stream        = cudf::get_default_stream(),
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   auto nrows = input.num_rows();
@@ -272,7 +271,7 @@ std::pair<std::unique_ptr<table>, std::vector<cudf::size_type>> round_robi
 {
   CUDF_FUNC_RANGE();
   return detail::round_robin_partition(
-    input, num_partitions, start_partition, cudf::default_stream_value, mr);
+    input, num_partitions, start_partition, cudf::get_default_stream(), mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/quantiles/quantile.cu b/cpp/src/quantiles/quantile.cu
index 1fe9809d922..1f1941529c9 100644
--- a/cpp/src/quantiles/quantile.cu
+++ b/cpp/src/quantiles/quantile.cu
@@ -189,7 +189,7 @@ std::unique_ptr<column> quantile(column_view const& input,
                                  rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::quantile(input, q, interp, ordered_indices, exact, cudf::default_stream_value, mr);
+  return detail::quantile(input, q, interp, ordered_indices, exact, cudf::get_default_stream(), mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/quantiles/quantiles.cu b/cpp/src/quantiles/quantiles.cu
index c6957482f05..e3e19eaeec4 100644
--- a/cpp/src/quantiles/quantiles.cu
+++ b/cpp/src/quantiles/quantiles.cu
@@ -83,12 +83,12 @@ std::unique_ptr<table> quantiles(table_view const& input,
                            thrust::make_counting_iterator(0),
                            q,
                            interp,
-                           cudf::default_stream_value,
+                           cudf::get_default_stream(),
                            mr);
   } else {
     auto sorted_idx = detail::sorted_order(input, column_order, null_precedence);
     return detail::quantiles(
-      input, sorted_idx->view().data<size_type>(), q, interp, cudf::default_stream_value, mr);
+      input, sorted_idx->view().data<size_type>(), q, interp, cudf::get_default_stream(), mr);
   }
 }
 
@@ -109,7 +109,7 @@ std::unique_ptr<table> quantiles(table_view const& input,
                          is_input_sorted,
                          column_order,
                          null_precedence,
-                         cudf::default_stream_value,
+                         cudf::get_default_stream(),
                          mr);
 }
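detail::quantiles above reads values through sorted_order indices and, for exact quantiles, interpolates between neighbors. A host-side sketch of linear-interpolation quantile over already-sorted data (illustrative, not the dispatched device path):

#include <cmath>
#include <cstddef>
#include <vector>

// Linear-interpolation quantile over sorted data: q = 0.5 on {1, 2, 4}
// lands at position 1.0 and returns 2.0 exactly.
double quantile_linear(std::vector<double> const& sorted, double q)
{
  double const pos  = q * static_cast<double>(sorted.size() - 1);
  auto const   lo   = static_cast<std::size_t>(std::floor(pos));
  auto const   hi   = static_cast<std::size_t>(std::ceil(pos));
  double const frac = pos - static_cast<double>(lo);
  return sorted[lo] + frac * (sorted[hi] - sorted[lo]);
}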
diff --git a/cpp/src/quantiles/tdigest/tdigest.cu b/cpp/src/quantiles/tdigest/tdigest.cu
index a11d7ab6646..0c90b0af8d2 100644
--- a/cpp/src/quantiles/tdigest/tdigest.cu
+++ b/cpp/src/quantiles/tdigest/tdigest.cu
@@ -14,13 +14,14 @@
  * limitations under the License.
  */
 
+#include
+
 #include
 #include
 #include
 #include
 #include
 #include
-#include
 #include
 #include
@@ -42,8 +43,8 @@ using namespace cudf::tdigest;
 
 namespace cudf {
-namespace detail {
 namespace tdigest {
+namespace detail {
 
 // https://developer.nvidia.com/blog/lerp-faster-cuda/
 template <typename T>
@@ -338,7 +339,7 @@ std::unique_ptr<scalar> make_empty_tdigest_scalar(rmm::cuda_stream_view stream,
     std::move(*std::make_unique<table>(std::move(contents.children))), true, stream, mr);
 }
 
-}  // namespace tdigest
+}  // namespace detail
 
 std::unique_ptr<column> percentile_approx(tdigest_column_view const& input,
                                           column_view const& percentiles,
@@ -354,8 +355,8 @@ std::unique_ptr<column> percentile_approx(tdigest_column_view const& input,
     data_type{type_id::INT32}, input.size() + 1, mask_state::UNALLOCATED, stream, mr);
   auto const all_empty_rows =
     thrust::count_if(rmm::exec_policy(stream),
-                     input.size_begin(),
-                     input.size_begin() + input.size(),
+                     detail::size_begin(input),
+                     detail::size_begin(input) + input.size(),
                      [] __device__(auto const x) { return x == 0; }) == input.size();
   auto row_size_iter = thrust::make_constant_iterator(all_empty_rows ? 0 : percentiles.size());
   thrust::exclusive_scan(rmm::exec_policy(stream),
@@ -379,7 +380,7 @@ std::unique_ptr<column> percentile_approx(tdigest_column_view const& input,
   //  uninitialized)
   auto [bitmask, null_count] = [stream, mr, &tdv]() {
     auto tdigest_is_empty = thrust::make_transform_iterator(
-      tdv.size_begin(),
+      detail::size_begin(tdv),
       [] __device__(size_type tdigest_size) -> size_type { return tdigest_size == 0; });
     auto const null_count =
       thrust::reduce(rmm::exec_policy(stream), tdigest_is_empty, tdigest_is_empty + tdv.size(), 0);
@@ -390,24 +391,23 @@ std::unique_ptr<column> percentile_approx(tdigest_column_view const& input,
       tdigest_is_empty, tdigest_is_empty + tdv.size(), thrust::logical_not{}, stream, mr);
   }();
 
-  return cudf::make_lists_column(
-    input.size(),
-    std::move(offsets),
-    tdigest::compute_approx_percentiles(input, percentiles, stream, mr),
-    null_count,
-    std::move(bitmask),
-    stream,
-    mr);
+  return cudf::make_lists_column(input.size(),
+                                 std::move(offsets),
+                                 detail::compute_approx_percentiles(input, percentiles, stream, mr),
+                                 null_count,
+                                 std::move(bitmask),
+                                 stream,
+                                 mr);
 }
 
-}  // namespace detail
+}  // namespace tdigest
 
 std::unique_ptr<column> percentile_approx(tdigest_column_view const& input,
                                           column_view const& percentiles,
                                           rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::percentile_approx(input, percentiles, cudf::default_stream_value, mr);
+  return tdigest::percentile_approx(input, percentiles, cudf::get_default_stream(), mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
index d870b73dff4..38c6cf7bd2e 100644
--- a/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
+++ b/cpp/src/quantiles/tdigest/tdigest_aggregation.cu
@@ -14,6 +14,8 @@
  * limitations under the License.
  */
 
+#include
+
 #include
 #include
 #include
@@ -26,7 +28,6 @@
 #include
 #include
 #include
-#include
 #include
 #include
@@ -52,10 +53,8 @@
 #include
 
 namespace cudf {
-namespace detail {
 namespace tdigest {
-
-using namespace cudf::tdigest;
+namespace detail {
 
 namespace {
 
@@ -596,7 +595,7 @@ std::unique_ptr<column> build_output_column(size_type num_rows,
   // if there are no stub tdigests, we can return immediately.
   if (num_stubs == 0) {
-    return cudf::detail::tdigest::make_tdigest_column(num_rows,
+    return cudf::tdigest::detail::make_tdigest_column(num_rows,
                                                       std::move(means),
                                                       std::move(weights),
                                                       std::move(offsets),
@@ -642,7 +641,7 @@ std::unique_ptr<column> build_output_column(size_type num_rows,
                          0);
 
   // assemble final column
-  return cudf::detail::tdigest::make_tdigest_column(num_rows,
+  return cudf::tdigest::detail::make_tdigest_column(num_rows,
                                                     std::move(_means),
                                                     std::move(_weights),
                                                     std::move(offsets),
@@ -708,7 +707,7 @@ std::unique_ptr<column> compute_tdigests(int delta,
   //   double       // max
   // }
   //
-  if (total_clusters == 0) { return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); }
+  if (total_clusters == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); }
 
   // each input group represents an individual tdigest.  within each tdigest, we want the keys
   // to represent cluster indices (for example, if a tdigest had 100 clusters, the keys should fall
@@ -1067,9 +1066,10 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
   // generate min and max values
   auto merged_min_col = cudf::make_numeric_column(
     data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr);
-  auto min_iter = thrust::make_transform_iterator(
-    thrust::make_zip_iterator(thrust::make_tuple(tdv.min_begin(), tdv.size_begin())),
-    tdigest_min{});
+  auto min_iter =
+    thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(
+                                      tdv.min_begin(), cudf::tdigest::detail::size_begin(tdv))),
+                                    tdigest_min{});
   thrust::reduce_by_key(rmm::exec_policy(stream),
                         group_labels,
                         group_labels + num_group_labels,
@@ -1081,9 +1081,10 @@ std::unique_ptr<column> merge_tdigests(tdigest_column_view const& tdv,
   auto merged_max_col = cudf::make_numeric_column(
     data_type{type_id::FLOAT64}, num_groups, mask_state::UNALLOCATED, stream, mr);
-  auto max_iter = thrust::make_transform_iterator(
-    thrust::make_zip_iterator(thrust::make_tuple(tdv.max_begin(), tdv.size_begin())),
-    tdigest_max{});
+  auto max_iter =
+    thrust::make_transform_iterator(thrust::make_zip_iterator(thrust::make_tuple(
+                                      tdv.max_begin(), cudf::tdigest::detail::size_begin(tdv))),
+                                    tdigest_max{});
   thrust::reduce_by_key(rmm::exec_policy(stream),
                         group_labels,
                         group_labels + num_group_labels,
@@ -1190,7 +1191,7 @@ std::unique_ptr<scalar> reduce_tdigest(column_view const& col,
                                        rmm::cuda_stream_view stream,
                                        rmm::mr::device_memory_resource* mr)
 {
-  if (col.size() == 0) { return cudf::detail::tdigest::make_empty_tdigest_scalar(stream, mr); }
+  if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_scalar(stream, mr); }
 
   // since this isn't coming out of a groupby, we need to sort the inputs in ascending
   // order with nulls at the end.
@@ -1209,7 +1210,7 @@ std::unique_ptr<scalar> reduce_merge_tdigest(column_view const& input,
 {
   tdigest_column_view tdv(input);
 
-  if (input.size() == 0) { return cudf::detail::tdigest::make_empty_tdigest_scalar(stream, mr); }
+  if (input.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_scalar(stream, mr); }
 
   auto h_group_offsets = cudf::detail::make_counting_transform_iterator(
     0, [size = input.size()](size_type i) { return i == 0 ? 0 : size; });
@@ -1238,7 +1239,7 @@ std::unique_ptr<column> group_tdigest(column_view const& col,
                                       rmm::cuda_stream_view stream,
                                       rmm::mr::device_memory_resource* mr)
 {
-  if (col.size() == 0) { return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr); }
+  if (col.size() == 0) { return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr); }
 
   auto const delta = max_centroids;
   return cudf::type_dispatcher(col.type(),
@@ -1264,7 +1265,7 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input,
   tdigest_column_view tdv(input);
 
   if (num_groups == 0 || input.size() == 0) {
-    return cudf::detail::tdigest::make_empty_tdigest_column(stream, mr);
+    return cudf::tdigest::detail::make_empty_tdigest_column(stream, mr);
   }
 
   // bring group offsets back to the host
@@ -1286,6 +1287,6 @@ std::unique_ptr<column> group_merge_tdigest(column_view const& input,
                               mr);
 }
 
-}  // namespace tdigest
 }  // namespace detail
+}  // namespace tdigest
 }  // namespace cudf
diff --git a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp
index a86b40fd64a..cfcd21c5690 100644
--- a/cpp/src/quantiles/tdigest/tdigest_column_view.cpp
+++ b/cpp/src/quantiles/tdigest/tdigest_column_view.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2021, NVIDIA CORPORATION.
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.
@@ -17,13 +17,11 @@
 #include
 #include
 #include
-#include
+#include
 
 namespace cudf {
 namespace tdigest {
 
-using namespace cudf;
-
 tdigest_column_view::tdigest_column_view(column_view const& col) : column_view(col)
 {
   // sanity check that this is actually tdigest data
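merge_tdigests, shown above, derives per-group minima with reduce_by_key over a zip of (value, tdigest size) pairs so that empty digests cannot contribute. A sketch of the same masking trick with thrust (masked_min plays the role the diff's tdigest_min functor does; output vectors are assumed presized):

#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/functional.h>
#include <thrust/iterator/transform_iterator.h>
#include <thrust/iterator/zip_iterator.h>
#include <thrust/reduce.h>

#include <cfloat>

// Empty tdigests (size == 0) must not win the per-group minimum, so they are
// mapped to DBL_MAX before the keyed reduction.
struct masked_min {
  __host__ __device__ double operator()(thrust::tuple<double, int> t) const
  {
    return thrust::get<1>(t) > 0 ? thrust::get<0>(t) : DBL_MAX;
  }
};

void group_min(thrust::device_vector<int> const& labels,
               thrust::device_vector<double> const& mins,
               thrust::device_vector<int> const& sizes,
               thrust::device_vector<int>& out_keys,
               thrust::device_vector<double>& out_min)
{
  auto in = thrust::make_transform_iterator(
    thrust::make_zip_iterator(thrust::make_tuple(mins.begin(), sizes.begin())), masked_min{});
  thrust::reduce_by_key(thrust::device,
                        labels.begin(), labels.end(), in,
                        out_keys.begin(), out_min.begin(),
                        thrust::equal_to<int>{}, thrust::minimum<double>{});
}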
diff --git a/cpp/src/quantiles/tdigest/tdigest_util.cuh b/cpp/src/quantiles/tdigest/tdigest_util.cuh
new file mode 100644
index 00000000000..d0e6484875b
--- /dev/null
+++ b/cpp/src/quantiles/tdigest/tdigest_util.cuh
@@ -0,0 +1,56 @@
+/*
+ * Copyright (c) 2021-2022, NVIDIA CORPORATION.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include
+#include
+
+namespace cudf {
+namespace tdigest {
+namespace detail {
+
+/**
+ * @brief Functor to compute the size of each tdigest of a column
+ */
+struct tdigest_size_fn {
+  size_type const* offsets;  ///< Offsets of the t-digest column
+  /**
+   * @brief Returns the size of each tdigest in the column
+   *
+   * @param tdigest_index Index of the tdigest in the column
+   * @return Size of the tdigest
+   */
+  __device__ size_type operator()(size_type tdigest_index)
+  {
+    return offsets[tdigest_index + 1] - offsets[tdigest_index];
+  }
+};
+
+/**
+ * @brief Returns an iterator that returns the size of each tdigest
+ * in the column (each row is one digest)
+ *
+ * @return An iterator that returns the size of each tdigest in the column
+ */
+inline auto size_begin(tdigest_column_view const& tdv)
+{
+  return cudf::detail::make_counting_transform_iterator(
+    0, tdigest_size_fn{tdv.centroids().offsets_begin()});
+}
+
+}  // namespace detail
+}  // namespace tdigest
+}  // namespace cudf
diff --git a/cpp/src/reductions/minmax.cu b/cpp/src/reductions/minmax.cu
index e69942552ff..603e13c1894 100644
--- a/cpp/src/reductions/minmax.cu
+++ b/cpp/src/reductions/minmax.cu
@@ -277,7 +277,7 @@ std::pair<std::unique_ptr<scalar>, std::unique_ptr<scalar>> minmax(
   const column_view& col, rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::minmax(col, cudf::default_stream_value, mr);
+  return detail::minmax(col, cudf::get_default_stream(), mr);
 }
 
 }  // namespace cudf
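The reductions.cpp diff that follows replaces dynamic_cast on an owning pointer with static_cast on a reference: aggregation_dispatcher has already branched on agg.kind, so the concrete type is statically known and the RTTI check is pure overhead. A minimal sketch of the pattern with hypothetical aggregation types:

enum class kind { SUM, QUANTILE };

struct aggregation { kind k; };
struct quantile_aggregation : aggregation { double q; };

double dispatch(aggregation const& agg)
{
  switch (agg.k) {
    case kind::QUANTILE:
      // The switch already proved the dynamic type; static_cast avoids RTTI.
      return static_cast<quantile_aggregation const&>(agg).q;
    default: return 0.0;
  }
}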
diff --git a/cpp/src/reductions/reductions.cpp b/cpp/src/reductions/reductions.cpp
index 523865e0df0..38db7eb3e89 100644
--- a/cpp/src/reductions/reductions.cpp
+++ b/cpp/src/reductions/reductions.cpp
@@ -49,7 +49,7 @@ struct reduce_dispatch_functor {
   }
 
   template <aggregation::Kind k>
-  std::unique_ptr<scalar> operator()(std::unique_ptr<reduce_aggregation> const& agg)
+  std::unique_ptr<scalar> operator()(reduce_aggregation const& agg)
   {
     switch (k) {
       case aggregation::SUM: return reduction::sum(col, output_dtype, init, stream, mr);
@@ -62,12 +62,12 @@ struct reduce_dispatch_functor {
         return reduction::sum_of_squares(col, output_dtype, stream, mr);
       case aggregation::MEAN: return reduction::mean(col, output_dtype, stream, mr);
       case aggregation::VARIANCE: {
-        auto var_agg = dynamic_cast<std_var_aggregation const*>(agg.get());
-        return reduction::variance(col, output_dtype, var_agg->_ddof, stream, mr);
+        auto var_agg = static_cast<std_var_aggregation const&>(agg);
+        return reduction::variance(col, output_dtype, var_agg._ddof, stream, mr);
       }
       case aggregation::STD: {
-        auto var_agg = dynamic_cast<std_var_aggregation const*>(agg.get());
-        return reduction::standard_deviation(col, output_dtype, var_agg->_ddof, stream, mr);
+        auto var_agg = static_cast<std_var_aggregation const&>(agg);
+        return reduction::standard_deviation(col, output_dtype, var_agg._ddof, stream, mr);
       }
       case aggregation::MEDIAN: {
         auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream);
@@ -78,60 +78,59 @@ struct reduce_dispatch_functor {
         return get_element(*col_ptr, 0, stream, mr);
       }
       case aggregation::QUANTILE: {
-        auto quantile_agg = dynamic_cast<quantile_aggregation const*>(agg.get());
-        CUDF_EXPECTS(quantile_agg->_quantiles.size() == 1,
+        auto quantile_agg = static_cast<quantile_aggregation const&>(agg);
+        CUDF_EXPECTS(quantile_agg._quantiles.size() == 1,
                      "Reduction quantile accepts only one quantile value");
         auto sorted_indices = sorted_order(table_view{{col}}, {}, {null_order::AFTER}, stream);
         auto valid_sorted_indices =
           split(*sorted_indices, {col.size() - col.null_count()}, stream)[0];
 
         auto col_ptr = quantile(col,
-                                quantile_agg->_quantiles,
-                                quantile_agg->_interpolation,
+                                quantile_agg._quantiles,
+                                quantile_agg._interpolation,
                                 valid_sorted_indices,
                                 true,
                                 stream);
         return get_element(*col_ptr, 0, stream, mr);
       }
       case aggregation::NUNIQUE: {
-        auto nunique_agg = dynamic_cast<nunique_aggregation const*>(agg.get());
+        auto nunique_agg = static_cast<nunique_aggregation const&>(agg);
         return make_fixed_width_scalar(
-          detail::distinct_count(
-            col, nunique_agg->_null_handling, nan_policy::NAN_IS_VALID, stream),
+          detail::distinct_count(col, nunique_agg._null_handling, nan_policy::NAN_IS_VALID, stream),
           stream,
           mr);
       }
       case aggregation::NTH_ELEMENT: {
-        auto nth_agg = dynamic_cast<nth_element_aggregation const*>(agg.get());
-        return reduction::nth_element(col, nth_agg->_n, nth_agg->_null_handling, stream, mr);
+        auto nth_agg = static_cast<nth_element_aggregation const&>(agg);
+        return reduction::nth_element(col, nth_agg._n, nth_agg._null_handling, stream, mr);
       }
       case aggregation::COLLECT_LIST: {
-        auto col_agg = dynamic_cast<collect_list_aggregation const*>(agg.get());
-        return reduction::collect_list(col, col_agg->_null_handling, stream, mr);
+        auto col_agg = static_cast<collect_list_aggregation const&>(agg);
+        return reduction::collect_list(col, col_agg._null_handling, stream, mr);
       }
       case aggregation::COLLECT_SET: {
-        auto col_agg = dynamic_cast<collect_set_aggregation const*>(agg.get());
+        auto col_agg = static_cast<collect_set_aggregation const&>(agg);
         return reduction::collect_set(
-          col, col_agg->_null_handling, col_agg->_nulls_equal, col_agg->_nans_equal, stream, mr);
+          col, col_agg._null_handling, col_agg._nulls_equal, col_agg._nans_equal, stream, mr);
       }
       case aggregation::MERGE_LISTS: {
         return reduction::merge_lists(col, stream, mr);
       }
       case aggregation::MERGE_SETS: {
-        auto col_agg = dynamic_cast<merge_sets_aggregation const*>(agg.get());
-        return reduction::merge_sets(col, col_agg->_nulls_equal, col_agg->_nans_equal, stream, mr);
+        auto col_agg = static_cast<merge_sets_aggregation const&>(agg);
+        return reduction::merge_sets(col, col_agg._nulls_equal, col_agg._nans_equal, stream, mr);
       }
       case aggregation::TDIGEST: {
         CUDF_EXPECTS(output_dtype.id() == type_id::STRUCT,
                      "Tdigest aggregations expect output type to be STRUCT");
-        auto td_agg = dynamic_cast<tdigest_aggregation const*>(agg.get());
-        return detail::tdigest::reduce_tdigest(col, td_agg->max_centroids, stream, mr);
+        auto td_agg = static_cast<tdigest_aggregation const&>(agg);
+        return tdigest::detail::reduce_tdigest(col, td_agg.max_centroids, stream, mr);
       }
       case aggregation::MERGE_TDIGEST: {
         CUDF_EXPECTS(output_dtype.id() == type_id::STRUCT,
                      "Tdigest aggregations expect output type to be STRUCT");
-        auto td_agg = dynamic_cast<merge_tdigest_aggregation const*>(agg.get());
-        return detail::tdigest::reduce_merge_tdigest(col, td_agg->max_centroids, stream, mr);
+        auto td_agg = static_cast<merge_tdigest_aggregation const&>(agg);
+        return tdigest::detail::reduce_merge_tdigest(col, td_agg.max_centroids, stream, mr);
       }
       default: CUDF_FAIL("Unsupported reduction operator");
     }
@@ -140,25 +139,25 @@ struct reduce_dispatch_functor {
 
 std::unique_ptr<scalar> reduce(
   column_view const& col,
-  std::unique_ptr<reduce_aggregation> const& agg,
+  reduce_aggregation const& agg,
   data_type output_dtype,
   std::optional<std::reference_wrapper<scalar const>> init,
-  rmm::cuda_stream_view stream        = cudf::default_stream_value,
+  rmm::cuda_stream_view stream,
   rmm::mr::device_memory_resource* mr = rmm::mr::get_current_device_resource())
 {
   CUDF_EXPECTS(!init.has_value() || col.type() == init.value().get().type(),
                "column and initial value must be the same type");
-  if (init.has_value() && !(agg->kind == aggregation::SUM || agg->kind == aggregation::PRODUCT ||
-                            agg->kind == aggregation::MIN || agg->kind == aggregation::MAX ||
-                            agg->kind == aggregation::ANY || agg->kind == aggregation::ALL)) {
+  if (init.has_value() && !(agg.kind == aggregation::SUM || agg.kind == aggregation::PRODUCT ||
+                            agg.kind == aggregation::MIN || agg.kind == aggregation::MAX ||
+                            agg.kind == aggregation::ANY || agg.kind == aggregation::ALL)) {
     CUDF_FAIL(
      "Initial value is only supported for SUM, PRODUCT, MIN, MAX, ANY, and ALL aggregation types");
   }
 
   // Returns default scalar if input column is non-valid. In terms of nested columns, we need to
   // handcraft the default scalar with input column.
   if (col.size() <= col.null_count()) {
-    if (agg->kind == aggregation::TDIGEST || agg->kind == aggregation::MERGE_TDIGEST) {
-      return detail::tdigest::make_empty_tdigest_scalar();
+    if (agg.kind == aggregation::TDIGEST || agg.kind == aggregation::MERGE_TDIGEST) {
+      return tdigest::detail::make_empty_tdigest_scalar(stream);
     }
     if (col.type().id() == type_id::EMPTY || col.type() != output_dtype) {
       // Under some circumstance, the output type will become the List of input type,
@@ -176,26 +175,26 @@ std::unique_ptr<scalar> reduce(
   }
 
   return aggregation_dispatcher(
-    agg->kind, reduce_dispatch_functor{col, output_dtype, init, stream, mr}, agg);
+    agg.kind, reduce_dispatch_functor{col, output_dtype, init, stream, mr}, agg);
 }
 
 }  // namespace detail
 
 std::unique_ptr<scalar> reduce(column_view const& col,
-                               std::unique_ptr<reduce_aggregation> const& agg,
+                               reduce_aggregation const& agg,
                                data_type output_dtype,
                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::reduce(col, agg, output_dtype, std::nullopt, cudf::default_stream_value, mr);
+  return detail::reduce(col, agg, output_dtype, std::nullopt, cudf::get_default_stream(), mr);
 }
 
 std::unique_ptr<scalar> reduce(column_view const& col,
-                               std::unique_ptr<reduce_aggregation> const& agg,
+                               reduce_aggregation const& agg,
                                data_type output_dtype,
                                std::optional<std::reference_wrapper<scalar const>> init,
                                rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::reduce(col, agg, output_dtype, init, cudf::default_stream_value, mr);
+  return detail::reduce(col, agg, output_dtype, init, cudf::get_default_stream(), mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/reductions/scan/scan.cpp b/cpp/src/reductions/scan/scan.cpp
index 85c0f7ea13f..2871ee283ba 100644
--- a/cpp/src/reductions/scan/scan.cpp
+++ b/cpp/src/reductions/scan/scan.cpp
@@ -25,16 +25,16 @@ namespace cudf {
 namespace detail {
 
 std::unique_ptr<column> scan(column_view const& input,
-                             std::unique_ptr<scan_aggregation> const& agg,
+                             scan_aggregation const& agg,
                              scan_type inclusive,
                              null_policy null_handling,
                              rmm::cuda_stream_view stream,
                              rmm::mr::device_memory_resource* mr)
 {
-  if (agg->kind == aggregation::RANK) {
+  if (agg.kind == aggregation::RANK) {
     CUDF_EXPECTS(inclusive == scan_type::INCLUSIVE,
                  "Rank aggregation operator requires an inclusive scan");
-    auto const& rank_agg = dynamic_cast<rank_aggregation const&>(*agg);
+    auto const& rank_agg = static_cast<rank_aggregation const&>(agg);
     if (rank_agg._method == rank_method::MIN) {
       if (rank_agg._percentage == rank_percentage::NONE) {
         return inclusive_rank_scan(input, stream, mr);
@@ -55,13 +55,13 @@ std::unique_ptr<column> scan(column_view const& input,
 }
 
 }  // namespace detail
 
 std::unique_ptr<column> scan(column_view const& input,
-                             std::unique_ptr<scan_aggregation> const& agg,
+                             scan_aggregation const& agg,
                              scan_type inclusive,
                              null_policy null_handling,
                              rmm::mr::device_memory_resource* mr)
 {
   CUDF_FUNC_RANGE();
-  return detail::scan(input, agg, inclusive, null_handling, cudf::default_stream_value, mr);
+  return detail::scan(input, agg, inclusive, null_handling, cudf::get_default_stream(), mr);
 }
 
 }  // namespace cudf
diff --git a/cpp/src/reductions/scan/scan.cuh b/cpp/src/reductions/scan/scan.cuh
index 127f2ae95b4..2ad6124cdd0 100644
--- a/cpp/src/reductions/scan/scan.cuh
+++ b/cpp/src/reductions/scan/scan.cuh
@@ -35,12 +35,12 @@ rmm::device_buffer mask_scan(column_view const& input_view,
 
 template